(index) Experimental initial integration of document spans into index

This commit is contained in:
Viktor Lofgren 2024-07-30 12:01:53 +02:00
parent 80900107f7
commit b316b55be9
29 changed files with 394 additions and 162 deletions

View File

@ -59,13 +59,4 @@ public class CompiledQueryAggregates {
return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query)));
}
/** Aggregates the long values produced by the provided operator for the terms of the query
 * into the set of all possible combined values, by visiting the query tree with a
 * CqPositionsOperator (values are combined using the bitwise AND operator).
 *
 * @param query the compiled query whose terms are passed to the operator
 * @param operator produces a long value for a term of the query
 * @return the set produced by visiting the query root with the CqPositionsOperator
 */
public static <T> LongSet positionsAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
return query.root().visit(new CqPositionsOperator(query, operator));
}
/** Aggregates the long values produced by the provided operator for the entries of the query
 * into the set of all possible combined values, by visiting the query tree with a
 * CqPositionsOperator (values are combined using the bitwise AND operator).
 *
 * @param query the compiled long-valued query whose entries are passed to the operator
 * @param operator maps the long stored for an entry of the query to the value to aggregate
 * @return the set produced by visiting the query root with the CqPositionsOperator
 */
public static <T> LongSet positionsAggregate(CompiledQueryLong query, LongUnaryOperator operator) {
return query.root().visit(new CqPositionsOperator(query, operator));
}
}

View File

@ -1,85 +0,0 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.List;
import java.util.function.IntToLongFunction;
import java.util.function.LongUnaryOperator;
import java.util.function.ToLongFunction;
/** Query-tree visitor that computes the set of every long value obtainable by combining
 * the per-leaf values of a compiled query: the children of an AND node are combined
 * pairwise with the bitwise AND operator, the children of an OR node are unioned,
 * and a leaf contributes the single value the operator yields for its index.
 */
public class CqPositionsOperator implements CqExpression.ObjectVisitor<LongSet> {
    /** Maps a leaf index of the query to the long value of that leaf */
    private final IntToLongFunction operator;

    /** @param query    the query whose leaves are looked up by index
     *  @param operator converts the value stored at a leaf into a long
     */
    public <T> CqPositionsOperator(CompiledQuery<T> query, ToLongFunction<T> operator) {
        this.operator = idx -> operator.applyAsLong(query.at(idx));
    }

    /** @param query    the query whose leaves are looked up by index
     *  @param operator transforms the long stored at a leaf
     */
    public CqPositionsOperator(CompiledQueryLong query, LongUnaryOperator operator) {
        this.operator = idx -> operator.applyAsLong(query.at(idx));
    }

    /** Folds the children's sets together, left to right, with {@link #combineSets} */
    @Override
    public LongSet onAnd(List<? extends CqExpression> parts) {
        LongSet combined = new LongArraySet();

        for (CqExpression part : parts) {
            LongSet partValues = part.visit(this);
            combined = combineSets(combined, partValues);
        }

        return combined;
    }

    /** Returns the set of {@code x & y} for every x in a and y in b;
     * if either set is empty, the other set is returned as-is. */
    private LongSet combineSets(LongSet a, LongSet b) {
        if (a.isEmpty()) {
            return b;
        }
        if (b.isEmpty()) {
            return a;
        }

        LongSet products = newSet(a.size() * b.size());

        for (var left = a.longIterator(); left.hasNext(); ) {
            long leftValue = left.nextLong();

            for (var right = b.longIterator(); right.hasNext(); ) {
                products.add(leftValue & right.nextLong());
            }
        }

        return products;
    }

    /** Collects the values of all children into a single set */
    @Override
    public LongSet onOr(List<? extends CqExpression> parts) {
        LongSet union = newSet(parts.size());

        for (CqExpression part : parts) {
            union.addAll(part.visit(this));
        }

        return union;
    }

    /** Returns a singleton set holding the operator's value for the leaf */
    @Override
    public LongSet onLeaf(int idx) {
        LongSet single = newSet(1);
        single.add(operator.applyAsLong(idx));
        return single;
    }

    /** Allocate a new set suitable for a collection with the provided cardinality:
     * an array-backed set below 8 elements, otherwise a hash set */
    private LongSet newSet(int cardinality) {
        return cardinality < 8
                ? new LongArraySet(cardinality)
                : new LongOpenHashSet(cardinality);
    }
}

View File

@ -17,6 +17,7 @@ dependencies {
implementation project(':code:libraries:btree')
implementation project(':code:libraries:slop')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
implementation project(':code:common:model')

View File

@ -1,6 +1,6 @@
package nu.marginalia.index.forward;
class ForwardIndexParameters {
public class ForwardIndexParameters {
public static final int ENTRY_SIZE = 3;
public static final int METADATA_OFFSET = 0;
public static final int FEATURES_OFFSET = 1;

View File

@ -3,11 +3,14 @@ package nu.marginalia.index.forward;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Files;
import java.nio.file.Path;
@ -30,6 +33,7 @@ public class ForwardIndexReader {
private final LongArray data;
private final ForwardIndexSpansReader spansReader;
private final Logger logger = LoggerFactory.getLogger(getClass());
public ForwardIndexReader(Path idsFile,
@ -121,6 +125,21 @@ public class ForwardIndexReader {
return idToOffset.get(docId);
}
/** Retrieves the document spans for the specified document.
 *
 * @param arena the arena handed to the spans reader for its read buffer
 * @param docId the document to look up
 * @return the spans read for the document; an empty DocumentSpans if the index lookup
 *         for the document yields a negative offset, or if reading the spans fails
 *         with an IOException (the failure is logged, not propagated)
 */
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
    long offset = idxForDoc(docId);
    if (offset < 0) {
        return new DocumentSpans();
    }

    // The forward index entry stores the encoded location of the document's spans record
    long encodedOffset = data.get(ENTRY_SIZE * offset + SPANS_OFFSET);

    try {
        return spansReader.readSpans(arena, encodedOffset);
    }
    catch (IOException ex) {
        // Parameterized logging; the trailing throwable is logged with its stack trace
        logger.error("Failed to read spans for doc {}", docId, ex);
        return new DocumentSpans();
    }
}
public int totalDocCount() {
return idToOffset.size();

View File

@ -1,9 +1,11 @@
package nu.marginalia.index.forward;
package nu.marginalia.index.forward.construction;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexParameters;
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;

View File

@ -0,0 +1,77 @@
package nu.marginalia.index.forward.spans;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations;
/** The spans of one type within a document, stored as a sequence of interlaced
 * start and end positions. An instance created with the no-argument constructor
 * holds no sequence and behaves as an empty span list.
 */
public class DocumentSpan {

    /** A list of the interlaced start and end positions of each span in the document of this type,
     * or null if this is an empty span list */
    private final CodedSequence startsEnds;

    /** @param startsEnds the interlaced start and end positions of the spans */
    public DocumentSpan(CodedSequence startsEnds) {
        this.startsEnds = startsEnds;
    }

    /** Creates an empty span list */
    public DocumentSpan() {
        this.startsEnds = null;
    }

    /** Returns true if some span has {@code start <= position < end}.
     * The scan stops at the first span starting after the position, so this
     * assumes the spans are stored in ascending order.
     */
    public boolean containsPosition(int position) {
        if (startsEnds == null) {
            return false;
        }

        var iter = startsEnds.iterator();
        while (iter.hasNext()) {
            int start = iter.nextInt();
            if (start > position) {
                return false;
            }
            int end = iter.nextInt();
            if (end > position) {
                return true;
            }
        }

        return false;
    }

    /** Returns true if some span has {@code start <= rangeStart} and
     * {@code end > rangeStart + len}. The scan stops at the first span starting
     * after rangeStart, so this assumes the spans are stored in ascending order.
     */
    public boolean containsRange(int rangeStart, int len) {
        if (startsEnds == null) {
            return false;
        }

        var iter = startsEnds.iterator();
        while (iter.hasNext()) {
            int start = iter.nextInt();
            if (start > rangeStart) {
                return false;
            }
            int end = iter.nextInt();
            // NOTE(review): containsPosition treats end as exclusive; with the strict
            // comparison a range ending exactly at the span end is rejected -- confirm intended
            if (end > rangeStart + len) {
                return true;
            }
        }

        return false;
    }

    /** Returns the result of intersecting this span list's raw start/end values
     * with the values of the provided sequence.
     */
    public boolean overlapsRange(CodedSequence sequence) {
        // NOTE(review): this compares the sequence against the boundary values themselves,
        // not against the intervals between them -- confirm this is the intended test
        return SequenceOperations.intersectSequences(iterator(), sequence.iterator());
    }

    /** Returns an iterator over the start and end positions of each span in the document of this type */
    public IntIterator iterator() {
        if (null == startsEnds) {
            return IntList.of().iterator();
        }

        return startsEnds.iterator();
    }

    /** Returns the number of spans, i.e. half the number of stored values;
     * an empty span list has size 0.
     */
    public int size() {
        // The empty span list has no backing sequence; guard like every other accessor does
        if (startsEnds == null) {
            return 0;
        }

        return startsEnds.valueCount() / 2;
    }
}

View File

@ -0,0 +1,35 @@
package nu.marginalia.index.forward.spans;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.sequence.CodedSequence;
/** The spans of a document, grouped by the HTML tag type they were recorded for.
 * Every field starts out as a shared empty span list and is replaced when
 * {@link #accept} receives positions for the corresponding tag code.
 */
public class DocumentSpans {
    /** Shared sentinel for "no spans of this type"; final so it can never be reassigned */
    private static final DocumentSpan EMPTY_SPAN = new DocumentSpan();

    public DocumentSpan title = EMPTY_SPAN;
    public DocumentSpan heading = EMPTY_SPAN;

    public DocumentSpan nav = EMPTY_SPAN;
    public DocumentSpan pageHeader = EMPTY_SPAN;
    public DocumentSpan pageFooter = EMPTY_SPAN;

    public DocumentSpan code = EMPTY_SPAN;
    public DocumentSpan pre = EMPTY_SPAN;

    /** Stores the positions as the span list of the tag whose code equals {@code code}.
     * A code that matches none of the tags below is ignored.
     *
     * @param code      the tag code the positions were recorded under
     * @param positions the interlaced start and end positions of the spans
     */
    void accept(byte code, CodedSequence positions) {
        if (code == HtmlTag.HEADING.code) {
            this.heading = new DocumentSpan(positions);
        }
        else if (code == HtmlTag.TITLE.code) {
            this.title = new DocumentSpan(positions);
        }
        else if (code == HtmlTag.NAV.code) {
            this.nav = new DocumentSpan(positions);
        }
        else if (code == HtmlTag.PAGE_HEADER.code) {
            this.pageHeader = new DocumentSpan(positions);
        }
        else if (code == HtmlTag.PAGE_FOOTER.code) {
            this.pageFooter = new DocumentSpan(positions);
        }
        else if (code == HtmlTag.CODE.code) {
            this.code = new DocumentSpan(positions);
        }
        else if (code == HtmlTag.PRE.code) {
            this.pre = new DocumentSpan(positions);
        }
    }
}

View File

@ -1,6 +1,5 @@
package nu.marginalia.index.forward;
package nu.marginalia.index.forward.spans;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.GammaCodedSequence;
import java.io.IOException;
@ -9,8 +8,6 @@ import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.List;
@SuppressWarnings("preview")
public class ForwardIndexSpansReader implements AutoCloseable {
@ -20,9 +17,9 @@ public class ForwardIndexSpansReader implements AutoCloseable {
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
}
public List<SpanData> readSpans(Arena arena, long encodedOffset) throws IOException {
long size = encodedOffset & 0xFFF_FFFF;
long offset = encodedOffset >>> 28;
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
long size = SpansCodec.decodeSize(encodedOffset);
long offset = SpansCodec.decodeStartOffset(encodedOffset);
var buffer = arena.allocate(size).asByteBuffer();
buffer.clear();
@ -33,22 +30,16 @@ public class ForwardIndexSpansReader implements AutoCloseable {
int count = buffer.get();
List<SpanData> ret = new ArrayList<>();
DocumentSpans ret = new DocumentSpans();
while (count-- > 0) {
byte code = buffer.get();
short len = buffer.getShort();
final int pos = buffer.position();
// Decode the gamma-coded sequence; this will advance the buffer position
// in a not entirely predictable way, so we need to save the position
buffer.limit(buffer.position() + len);
var sequence = new GammaCodedSequence(buffer).values();
ret.add(new SpanData(code, sequence));
ret.accept(code, new GammaCodedSequence(buffer.slice(buffer.position(), len)));
// Reset the buffer position to the end of the span
buffer.position(pos + len);
buffer.limit(buffer.capacity());
buffer.position(buffer.position() + len);
}
return ret;
@ -59,5 +50,4 @@ public class ForwardIndexSpansReader implements AutoCloseable {
spansFileChannel.close();
}
public record SpanData(byte code, IntList data) {}
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.index.forward;
package nu.marginalia.index.forward.spans;
import java.io.IOException;
import java.nio.ByteBuffer;
@ -42,8 +42,7 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
}
public long endRecord() {
return stateStartOffset << 28 | stateLength;
return SpansCodec.encode(stateStartOffset, stateLength);
}
@Override

View File

@ -0,0 +1,17 @@
package nu.marginalia.index.forward.spans;
/** Packs the start offset and the size of a spans record into a single long:
 * the size occupies the low 28 bits, the start offset the bits above them.
 */
public class SpansCodec {
    /** Number of low bits reserved for the size */
    private static final int SIZE_BITS = 28;
    /** Mask selecting the size bits of an encoded value */
    private static final long SIZE_MASK = (1L << SIZE_BITS) - 1;

    /** Combines the start offset and size into one encoded value.
     * The size is masked to its 28 bits; with assertions enabled,
     * a size of 2^28 or more fails the assertion instead.
     */
    public static long encode(long startOffset, long size) {
        assert size < (1L << SIZE_BITS) : "Size must be less than 2^28";

        long offsetPart = startOffset << SIZE_BITS;
        long sizePart = size & SIZE_MASK;

        return offsetPart | sizePart;
    }

    /** Extracts the start offset (the bits above the low 28) from an encoded value */
    public static long decodeStartOffset(long encoded) {
        return encoded >>> SIZE_BITS;
    }

    /** Extracts the size (the low 28 bits) from an encoded value */
    public static long decodeSize(long encoded) {
        return encoded & SIZE_MASK;
    }
}

View File

@ -2,6 +2,7 @@ package nu.marginalia.index.forward;
import lombok.SneakyThrows;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.model.id.UrlIdCodec;

View File

@ -1,6 +1,7 @@
package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
import nu.marginalia.sequence.GammaCodedSequence;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
@ -11,7 +12,7 @@ import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.*;
class ForwardIndexSpansReaderTest {
Path testFile = Files.createTempFile("test", ".idx");
@ -32,12 +33,12 @@ class ForwardIndexSpansReaderTest {
long offset2;
try (var writer = new ForwardIndexSpansWriter(testFile)) {
writer.beginRecord(1);
writer.writeSpan((byte) 'a', GammaCodedSequence.generate(wa, 1, 3, 5).buffer());
writer.writeSpan((byte) 'h', GammaCodedSequence.generate(wa, 1, 3, 5, 8).buffer());
offset1 = writer.endRecord();
writer.beginRecord(2);
writer.writeSpan((byte) 'b', GammaCodedSequence.generate(wa, 2, 4, 6).buffer());
writer.writeSpan((byte) 'c', GammaCodedSequence.generate(wa, 3, 5, 7).buffer());
writer.writeSpan((byte) 'c', GammaCodedSequence.generate(wa, 2, 4, 6, 7).buffer());
writer.writeSpan((byte) 'p', GammaCodedSequence.generate(wa, 3, 5).buffer());
offset2 = writer.endRecord();
}
@ -47,17 +48,21 @@ class ForwardIndexSpansReaderTest {
var spans1 = reader.readSpans(arena, offset1);
var spans2 = reader.readSpans(arena, offset2);
assertEquals(1, spans1.size());
assertEquals(2, spans1.heading.size());
assertEquals('a', spans1.get(0).code());
assertEquals(IntList.of(1, 3, 5), spans1.get(0).data());
assertEquals(2, spans2.code.size());
assertEquals(2, spans2.size());
assertFalse(spans2.code.containsPosition(1));
assertTrue(spans2.code.containsPosition(3));
assertFalse(spans2.code.containsPosition(5));
assertTrue(spans2.code.containsPosition(6));
assertFalse(spans2.code.containsPosition(7));
assertFalse(spans2.code.containsPosition(8));
assertEquals('b', spans2.get(0).code());
assertEquals(IntList.of(2, 4, 6), spans2.get(0).data());
assertEquals('c', spans2.get(1).code());
assertEquals(IntList.of(3, 5, 7), spans2.get(1).data());
assertEquals(1, spans2.pre.size());
assertEquals(0, spans2.pageFooter.size());
assertFalse(spans2.pageFooter.containsPosition(8));
}
}
}

View File

@ -317,6 +317,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
try {
executeSearch();
}
catch (Exception ex) {
logger.error("Error in index lookup", ex);
}
finally {
synchronized (remainingIndexTasks) {
if (remainingIndexTasks.decrementAndGet() == 0) {

View File

@ -8,6 +8,7 @@ import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggre
import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery;
@ -186,11 +187,17 @@ public class CombinedIndexReader {
/** Retrieves the HTML features for the specified document */
public int getHtmlFeatures(long docId) {
return forwardIndexReader.getHtmlFeatures(docId);
} /** Retrieves the HTML features for the specified document */
}
/** Retrieves the document size for the specified document,
 * as reported by the forward index reader */
public int getDocumentSize(long docId) {
return forwardIndexReader.getDocumentSize(docId);
}
/** Retrieves the document spans for the specified document
 * by delegating to the forward index reader.
 *
 * @param arena passed through to the forward index reader
 * @param docId the document whose spans are requested
 */
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
return forwardIndexReader.getDocumentSpans(arena, docId);
}
/** Close the indexes (this is not done immediately)
* */

View File

@ -98,7 +98,7 @@ public class IndexResultRankingService {
}
// Calculate the preliminary score
var score = resultRanker.calculateScore(resultIds.at(i), searchTerms, flags, positions);
var score = resultRanker.calculateScore(arena, resultIds.at(i), searchTerms, flags, positions);
if (score != null) {
results.add(score);
}

View File

@ -3,15 +3,18 @@ package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CqDoubleSumOperator;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.TermCoherenceGroupList;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.id.UrlIdCodec;
@ -22,6 +25,7 @@ import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations;
import javax.annotation.Nullable;
import java.lang.foreign.Arena;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate;
@ -50,7 +54,8 @@ public class IndexResultScoreCalculator {
private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
@Nullable
public SearchResultItem calculateScore(long combinedId,
public SearchResultItem calculateScore(Arena arena,
long combinedId,
QuerySearchTerms searchTerms,
long[] wordFlags,
CodedSequence[] positions)
@ -78,8 +83,7 @@ public class IndexResultScoreCalculator {
long docMetadata = index.getDocumentMetadata(docId);
int htmlFeatures = index.getHtmlFeatures(docId);
int docSize = index.getDocumentSize(docId);
int bestCoherence = searchTerms.coherences.testOptional(positions);
DocumentSpans spans = index.getDocumentSpans(arena, docId);
double score = calculateSearchResultValue(
wordFlagsQuery,
@ -88,7 +92,9 @@ public class IndexResultScoreCalculator {
docMetadata,
htmlFeatures,
docSize,
bestCoherence,
spans,
positions,
searchTerms.coherences,
rankingContext);
SearchResultItem searchResult = new SearchResultItem(docId,
@ -169,10 +175,13 @@ public class IndexResultScoreCalculator {
public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
CompiledQueryInt positionsCountQuery,
CompiledQuery<CodedSequence> positionsQuery, long documentMetadata,
CompiledQuery<CodedSequence> positionsQuery,
long documentMetadata,
int features,
int length,
int bestCoherence,
DocumentSpans spans,
CodedSequence[] positions,
TermCoherenceGroupList coherences,
ResultRankingContext ctx)
{
if (length < 0) {
@ -205,6 +214,33 @@ public class IndexResultScoreCalculator {
temporalBias = 0;
}
int numCoherenceAll = coherences.countOptional(positions);
int bestCoherenceAll = coherences.testOptional(positions);
int bestCoherenceTitle = coherences.testOptional(positions, spans.title);
int bestCoherenceHeading = coherences.testOptional(positions, spans.heading);
double spanWeightedScore = positionsQuery.root.visit(new CqDoubleSumOperator(positionsQuery, termPos -> {
if (termPos == null)
return 0;
if (spans.title.overlapsRange(termPos))
return 5.0;
if (spans.heading.overlapsRange(termPos))
return 2.5;
if (spans.code.overlapsRange(termPos))
return 0.25;
if (spans.pre.overlapsRange(termPos))
return 0.25;
if (spans.nav.overlapsRange(termPos))
return 0.25;
if (spans.pageHeader.overlapsRange(termPos))
return 0.25;
if (spans.pageFooter.overlapsRange(termPos))
return 0.25;
return 1.0;
}));
double overallPart = averageSentenceLengthPenalty
+ documentLengthPenalty
+ qualityPenalty
@ -212,7 +248,11 @@ public class IndexResultScoreCalculator {
+ topologyBonus
+ temporalBias
+ flagsPenalty
+ bestCoherence;
+ bestCoherenceAll
+ bestCoherenceTitle
+ bestCoherenceHeading
+ numCoherenceAll / 4.
+ spanWeightedScore;
double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));
double tcfFirstPosition = 0.;

View File

@ -2,6 +2,7 @@ package nu.marginalia.index.results.model;
import it.unimi.dsi.fastutil.ints.IntIterator;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.index.forward.spans.DocumentSpan;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.sequence.CodedSequence;
@ -40,7 +41,7 @@ public class TermCoherenceGroupList {
public int testOptional(CodedSequence[] positions) {
int best = 0;
for (var coherenceSet : mandatoryGroups) {
for (var coherenceSet : optionalGroups) {
if (coherenceSet.test(positions)) {
best = Math.max(coherenceSet.size, best);
}
@ -48,6 +49,25 @@ public class TermCoherenceGroupList {
return best;
}
/** Counts the optional coherence groups whose test passes for the provided positions */
public int countOptional(CodedSequence[] positions) {
    int matches = 0;

    for (var group : optionalGroups) {
        matches += group.test(positions) ? 1 : 0;
    }

    return matches;
}
/** Returns the size of the largest optional coherence group whose test passes
 * for the provided positions within the given span, or 0 if none passes */
public int testOptional(CodedSequence[] positions, DocumentSpan span) {
    int largest = 0;

    for (var group : optionalGroups) {
        if (!group.test(span, positions)) {
            continue;
        }
        largest = Math.max(group.size, largest);
    }

    return largest;
}
public static final class TermCoherenceGroup {
private final int[] offsets;
@ -92,5 +112,37 @@ public class TermCoherenceGroupList {
return SequenceOperations.intersectSequences(sequences);
}
/** Tests whether the terms of this group occur in order somewhere inside the given span.
 * Returns false if any present term has a negative offset, or if no intersection
 * of the shifted position sequences lies within the span.
 */
public boolean test(DocumentSpan span, CodedSequence[] positions) {
    IntIterator[] shifted = new IntIterator[present.cardinality()];
    int filled = 0;

    for (int termIdx = 0; termIdx < offsets.length; termIdx++) {
        if (present.get(termIdx)) {
            int positionsIdx = offsets[termIdx];
            if (positionsIdx < 0) {
                return false;
            }

            // Shift each term's positions back by the term's index within the group
            // (note the negative sign), so that terms appearing in the correct order
            // line up on the same value when the sequences are intersected.
            shifted[filled++] = positions[positionsIdx].offsetIterator(-termIdx);
        }
    }

    // Each intersection is checked against the span as a range of shifted.length positions
    var matches = SequenceOperations.findIntersections(shifted);
    var matchIter = matches.iterator();
    while (matchIter.hasNext()) {
        if (span.containsRange(matchIter.nextInt(), shifted.length)) {
            return true;
        }
    }

    return false;
}
}
}

View File

@ -9,8 +9,8 @@ import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal;

View File

@ -11,8 +11,8 @@ import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;

View File

@ -13,8 +13,8 @@ import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;

View File

@ -1,6 +1,8 @@
package nu.marginalia.sequence;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
public class SequenceOperations {
@ -30,7 +32,7 @@ public class SequenceOperations {
if (values[i] == max) {
successes++;
} else {
successes = 0;
successes = 1;
// Discard values until we reach the maximum value seen so far,
// or until the end of the sequence is reached
@ -49,6 +51,63 @@ public class SequenceOperations {
return true;
}
/** Finds the values that are present in every one of the provided sequences.
 * <p>
 * The algorithm only ever advances past values smaller than the largest value seen,
 * so it assumes each sequence is in ascending order.
 *
 * @param sequences the sequences to intersect; the iterators are consumed
 * @return the common values in the order found; an empty list if fewer than two
 *         sequences are given or if any sequence is empty
 */
public static IntList findIntersections(IntIterator... sequences) {

    if (sequences.length <= 1)
        return IntList.of();

    // Initialize values with the first element of each sequence
    int[] values = new int[sequences.length];

    for (int i = 0; i < sequences.length; i++) {
        if (sequences[i].hasNext())
            values[i] = sequences[i].nextInt();
        else
            return IntList.of();
    }

    // Intersect the sequences by advancing all values smaller than the maximum seen so far
    // until they are equal to the maximum value, or until the end of the sequence is reached
    int max = Integer.MIN_VALUE;
    int successes = 0;

    IntList ret = new IntArrayList();

    outer:
    for (int i = 0;; i = (i + 1) % sequences.length)
    {
        if (successes == sequences.length) {
            // Every sequence is positioned on max: record the match
            ret.add(max);
            successes = 1;

            if (sequences[i].hasNext()) {
                // Move this sequence on and make its new value the target.
                // values[i] must be updated together with max; otherwise the value
                // just read is dropped when the loop returns to this sequence, and
                // an intersection at that value is missed.
                values[i] = sequences[i].nextInt();
                max = values[i];
            } else {
                break;
            }
        } else if (values[i] == max) {
            successes++;
        } else {
            successes = 1;

            // Discard values until we reach the maximum value seen so far,
            // or until the end of the sequence is reached
            while (values[i] < max) {
                if (sequences[i].hasNext()) {
                    values[i] = sequences[i].nextInt();
                } else {
                    break outer;
                }
            }

            // Update the maximum value, if necessary
            max = Math.max(max, values[i]);
        }
    }

    return ret;
}
/** Return the minimum word distance between two sequences, or a negative value if either sequence is empty.
* */
public static int minDistance(IntIterator seqA, IntIterator seqB)

View File

@ -162,7 +162,15 @@ public class BitReader {
}
else { // There's no more data to read!
refillCallback.run();
readNext();
if (underlying.hasRemaining()) {
readNext();
}
else {
// We've attempted to re-fill the buffer, but there's still no data to read, so we fail to avoid
// blowing up the stack with recursion
throw new IllegalStateException("No more data to read after attempted re-fill of underlying buffer");
}
}
}
}

View File

@ -1,6 +1,6 @@
package nu.marginalia.sequence;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import org.junit.jupiter.api.Test;
import java.nio.ByteBuffer;
@ -63,6 +63,17 @@ class SequenceOperationsTest {
assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator()));
}
/** Three sequences whose only common values are 8 and 10 */
@Test
void intersectSequencesDeepMatch3findIntersections() {
    ByteBuffer workArea = ByteBuffer.allocate(1024);

    GammaCodedSequence first = GammaCodedSequence.generate(workArea, 1, 3, 4, 7, 8, 9, 10, 11);
    GammaCodedSequence second = GammaCodedSequence.generate(workArea, 2, 5, 8, 10, 14);
    GammaCodedSequence third = GammaCodedSequence.generate(workArea, 1, 5, 8, 9, 10);

    IntList expected = IntList.of(8, 10);
    IntList actual = SequenceOperations.findIntersections(first.iterator(), second.iterator(), third.iterator());

    assertEquals(expected, actual);
}
@Test
void intersectSequencesDeepMismatch() {
ByteBuffer wa = ByteBuffer.allocate(1024);

View File

@ -54,8 +54,8 @@ public class HtmlStringTagger implements NodeVisitor {
case "code" -> pushTag(HtmlTag.CODE, el);
case "title" -> pushTag(HtmlTag.TITLE, el);
case "nav" -> pushTag(HtmlTag.NAV, el);
case "header" -> pushTag(HtmlTag.HEADER, el);
case "footer" -> pushTag(HtmlTag.FOOTER, el);
case "header" -> pushTag(HtmlTag.PAGE_HEADER, el);
case "footer" -> pushTag(HtmlTag.PAGE_FOOTER, el);
case "h1", "h2", "h3", "h4", "h5", "h6" -> pushTag(HtmlTag.HEADING, el);
}
}

View File

@ -1,21 +1,21 @@
package nu.marginalia.language.sentence.tag;
public enum HtmlTag {
SCRIPT('s', true, false),
STYLE('S', true, false),
CODE('c', false, true),
PRE('p', false, true),
TITLE('t', false, false),
HEADING('h', false, false),
NAV('n', false, false),
HEADER('H',false, false),
FOOTER('f', false, false);
SCRIPT((byte) 's', true, false),
STYLE((byte) 'S', true, false),
CODE((byte) 'c', false, true),
PRE((byte) 'p', false, true),
TITLE((byte) 't', false, false),
HEADING((byte) 'h', false, false),
NAV((byte) 'n', false, false),
PAGE_HEADER((byte) 'H',false, false),
PAGE_FOOTER((byte) 'f', false, false);
public char code;
public byte code;
public boolean exclude;
public boolean nonLanguage;
HtmlTag(char code, boolean exclude, boolean nonLanguage) {
HtmlTag(byte code, boolean exclude, boolean nonLanguage) {
this.code = code;
this.exclude = exclude;
this.nonLanguage = nonLanguage;

View File

@ -144,7 +144,7 @@ public class DocumentKeywordsBuilder {
public void addSpans(List<DocumentWordSpan> newSpans) {
for (var span : newSpans) {
wordSpans.computeIfAbsent(span.tag().code, k -> new ArrayList<>()).add(span);
wordSpans.computeIfAbsent((char) span.tag().code, k -> new ArrayList<>()).add(span);
}
}

View File

@ -9,8 +9,8 @@ import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.model.id.UrlIdCodec;

View File

@ -18,8 +18,8 @@ import nu.marginalia.index.ReverseIndexPrioFileNames;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.model.SearchParameters;