mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(index) Experimental initial integration of document spans into index
This commit is contained in:
parent
80900107f7
commit
b316b55be9
@ -59,13 +59,4 @@ public class CompiledQueryAggregates {
|
|||||||
return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query)));
|
return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query)));
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */
|
|
||||||
public static <T> LongSet positionsAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
|
|
||||||
return query.root().visit(new CqPositionsOperator(query, operator));
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */
|
|
||||||
public static <T> LongSet positionsAggregate(CompiledQueryLong query, LongUnaryOperator operator) {
|
|
||||||
return query.root().visit(new CqPositionsOperator(query, operator));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -1,85 +0,0 @@
|
|||||||
package nu.marginalia.api.searchquery.model.compiled.aggregate;
|
|
||||||
|
|
||||||
import it.unimi.dsi.fastutil.longs.LongArraySet;
|
|
||||||
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
|
||||||
import it.unimi.dsi.fastutil.longs.LongSet;
|
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.function.IntToLongFunction;
|
|
||||||
import java.util.function.LongUnaryOperator;
|
|
||||||
import java.util.function.ToLongFunction;
|
|
||||||
|
|
||||||
public class CqPositionsOperator implements CqExpression.ObjectVisitor<LongSet> {
|
|
||||||
private final IntToLongFunction operator;
|
|
||||||
|
|
||||||
public <T> CqPositionsOperator(CompiledQuery<T> query, ToLongFunction<T> operator) {
|
|
||||||
this.operator = idx -> operator.applyAsLong(query.at(idx));
|
|
||||||
}
|
|
||||||
|
|
||||||
public CqPositionsOperator(CompiledQueryLong query, LongUnaryOperator operator) {
|
|
||||||
this.operator = idx -> operator.applyAsLong(query.at(idx));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public LongSet onAnd(List<? extends CqExpression> parts) {
|
|
||||||
LongSet ret = new LongArraySet();
|
|
||||||
|
|
||||||
for (var part : parts) {
|
|
||||||
ret = comineSets(ret, part.visit(this));
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
private LongSet comineSets(LongSet a, LongSet b) {
|
|
||||||
if (a.isEmpty())
|
|
||||||
return b;
|
|
||||||
if (b.isEmpty())
|
|
||||||
return a;
|
|
||||||
|
|
||||||
LongSet ret = newSet(a.size() * b.size());
|
|
||||||
|
|
||||||
var ai = a.longIterator();
|
|
||||||
|
|
||||||
while (ai.hasNext()) {
|
|
||||||
long aval = ai.nextLong();
|
|
||||||
|
|
||||||
var bi = b.longIterator();
|
|
||||||
while (bi.hasNext()) {
|
|
||||||
ret.add(aval & bi.nextLong());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public LongSet onOr(List<? extends CqExpression> parts) {
|
|
||||||
LongSet ret = newSet(parts.size());
|
|
||||||
|
|
||||||
for (var part : parts) {
|
|
||||||
ret.addAll(part.visit(this));
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public LongSet onLeaf(int idx) {
|
|
||||||
var set = newSet(1);
|
|
||||||
set.add(operator.applyAsLong(idx));
|
|
||||||
return set;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Allocate a new set suitable for a collection with the provided cardinality */
|
|
||||||
private LongSet newSet(int cardinality) {
|
|
||||||
if (cardinality < 8)
|
|
||||||
return new LongArraySet(cardinality);
|
|
||||||
else
|
|
||||||
return new LongOpenHashSet(cardinality);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -17,6 +17,7 @@ dependencies {
|
|||||||
implementation project(':code:libraries:btree')
|
implementation project(':code:libraries:btree')
|
||||||
implementation project(':code:libraries:slop')
|
implementation project(':code:libraries:slop')
|
||||||
implementation project(':code:libraries:coded-sequence')
|
implementation project(':code:libraries:coded-sequence')
|
||||||
|
implementation project(':code:libraries:language-processing')
|
||||||
implementation project(':code:index:query')
|
implementation project(':code:index:query')
|
||||||
implementation project(':code:index:index-journal')
|
implementation project(':code:index:index-journal')
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.index.forward;
|
package nu.marginalia.index.forward;
|
||||||
|
|
||||||
class ForwardIndexParameters {
|
public class ForwardIndexParameters {
|
||||||
public static final int ENTRY_SIZE = 3;
|
public static final int ENTRY_SIZE = 3;
|
||||||
public static final int METADATA_OFFSET = 0;
|
public static final int METADATA_OFFSET = 0;
|
||||||
public static final int FEATURES_OFFSET = 1;
|
public static final int FEATURES_OFFSET = 1;
|
||||||
|
@ -3,11 +3,14 @@ package nu.marginalia.index.forward;
|
|||||||
import gnu.trove.map.hash.TLongIntHashMap;
|
import gnu.trove.map.hash.TLongIntHashMap;
|
||||||
import nu.marginalia.array.LongArray;
|
import nu.marginalia.array.LongArray;
|
||||||
import nu.marginalia.array.LongArrayFactory;
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
|
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||||
|
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.lang.foreign.Arena;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
|
||||||
@ -30,6 +33,7 @@ public class ForwardIndexReader {
|
|||||||
private final LongArray data;
|
private final LongArray data;
|
||||||
|
|
||||||
private final ForwardIndexSpansReader spansReader;
|
private final ForwardIndexSpansReader spansReader;
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
public ForwardIndexReader(Path idsFile,
|
public ForwardIndexReader(Path idsFile,
|
||||||
@ -121,6 +125,21 @@ public class ForwardIndexReader {
|
|||||||
return idToOffset.get(docId);
|
return idToOffset.get(docId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
|
||||||
|
long offset = idxForDoc(docId);
|
||||||
|
if (offset < 0) return new DocumentSpans();
|
||||||
|
|
||||||
|
long encodedOffset = data.get(ENTRY_SIZE * offset + SPANS_OFFSET);
|
||||||
|
|
||||||
|
try {
|
||||||
|
return spansReader.readSpans(arena, encodedOffset);
|
||||||
|
}
|
||||||
|
catch (IOException ex) {
|
||||||
|
logger.error("Failed to read spans for doc " + docId, ex);
|
||||||
|
return new DocumentSpans();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public int totalDocCount() {
|
public int totalDocCount() {
|
||||||
return idToOffset.size();
|
return idToOffset.size();
|
||||||
|
@ -1,9 +1,11 @@
|
|||||||
package nu.marginalia.index.forward;
|
package nu.marginalia.index.forward.construction;
|
||||||
|
|
||||||
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
|
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
|
||||||
import nu.marginalia.array.LongArray;
|
import nu.marginalia.array.LongArray;
|
||||||
import nu.marginalia.array.LongArrayFactory;
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||||
|
import nu.marginalia.index.forward.ForwardIndexParameters;
|
||||||
|
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
|
||||||
import nu.marginalia.index.journal.IndexJournal;
|
import nu.marginalia.index.journal.IndexJournal;
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
import nu.marginalia.model.idx.DocumentMetadata;
|
@ -0,0 +1,77 @@
|
|||||||
|
package nu.marginalia.index.forward.spans;
|
||||||
|
|
||||||
|
import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||||
|
import it.unimi.dsi.fastutil.ints.IntList;
|
||||||
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
|
import nu.marginalia.sequence.SequenceOperations;
|
||||||
|
|
||||||
|
public class DocumentSpan {
|
||||||
|
|
||||||
|
/** A list of the interlaced start and end positions of each span in the document of this type */
|
||||||
|
private final CodedSequence startsEnds;
|
||||||
|
|
||||||
|
public DocumentSpan(CodedSequence startsEnds) {
|
||||||
|
this.startsEnds = startsEnds;
|
||||||
|
}
|
||||||
|
|
||||||
|
public DocumentSpan() {
|
||||||
|
this.startsEnds = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean containsPosition(int position) {
|
||||||
|
if (startsEnds == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
var iter = startsEnds.iterator();
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
int start = iter.nextInt();
|
||||||
|
if (start > position) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
int end = iter.nextInt();
|
||||||
|
if (end > position) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean containsRange(int rangeStart, int len) {
|
||||||
|
if (startsEnds == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
var iter = startsEnds.iterator();
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
int start = iter.nextInt();
|
||||||
|
if (start > rangeStart) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
int end = iter.nextInt();
|
||||||
|
if (end > rangeStart + len) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean overlapsRange(CodedSequence sequence) {
|
||||||
|
return SequenceOperations.intersectSequences(iterator(), sequence.iterator());
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns an iterator over the start and end positions of each span in the document of this type */
|
||||||
|
public IntIterator iterator() {
|
||||||
|
if (null == startsEnds) {
|
||||||
|
return IntList.of().iterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
return startsEnds.iterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int size() {
|
||||||
|
return startsEnds.valueCount() / 2;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,35 @@
|
|||||||
|
package nu.marginalia.index.forward.spans;
|
||||||
|
|
||||||
|
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||||
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
|
|
||||||
|
public class DocumentSpans {
|
||||||
|
private static DocumentSpan EMPTY_SPAN = new DocumentSpan();
|
||||||
|
|
||||||
|
public DocumentSpan title = EMPTY_SPAN;
|
||||||
|
public DocumentSpan heading = EMPTY_SPAN;
|
||||||
|
|
||||||
|
public DocumentSpan nav = EMPTY_SPAN;
|
||||||
|
public DocumentSpan pageHeader = EMPTY_SPAN;
|
||||||
|
public DocumentSpan pageFooter = EMPTY_SPAN;
|
||||||
|
public DocumentSpan code = EMPTY_SPAN;
|
||||||
|
public DocumentSpan pre = EMPTY_SPAN;
|
||||||
|
|
||||||
|
void accept(byte code, CodedSequence positions) {
|
||||||
|
if (code == HtmlTag.HEADING.code)
|
||||||
|
this.heading = new DocumentSpan(positions);
|
||||||
|
else if (code == HtmlTag.TITLE.code)
|
||||||
|
this.title = new DocumentSpan(positions);
|
||||||
|
else if (code == HtmlTag.NAV.code)
|
||||||
|
this.nav = new DocumentSpan(positions);
|
||||||
|
else if (code == HtmlTag.PAGE_HEADER.code)
|
||||||
|
this.pageHeader = new DocumentSpan(positions);
|
||||||
|
else if (code == HtmlTag.PAGE_FOOTER.code)
|
||||||
|
this.pageFooter = new DocumentSpan(positions);
|
||||||
|
else if (code == HtmlTag.CODE.code)
|
||||||
|
this.code = new DocumentSpan(positions);
|
||||||
|
else if (code == HtmlTag.PRE.code)
|
||||||
|
this.pre = new DocumentSpan(positions);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,6 +1,5 @@
|
|||||||
package nu.marginalia.index.forward;
|
package nu.marginalia.index.forward.spans;
|
||||||
|
|
||||||
import it.unimi.dsi.fastutil.ints.IntList;
|
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -9,8 +8,6 @@ import java.nio.channels.FileChannel;
|
|||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.StandardOpenOption;
|
import java.nio.file.StandardOpenOption;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@SuppressWarnings("preview")
|
@SuppressWarnings("preview")
|
||||||
public class ForwardIndexSpansReader implements AutoCloseable {
|
public class ForwardIndexSpansReader implements AutoCloseable {
|
||||||
@ -20,9 +17,9 @@ public class ForwardIndexSpansReader implements AutoCloseable {
|
|||||||
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
|
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<SpanData> readSpans(Arena arena, long encodedOffset) throws IOException {
|
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
|
||||||
long size = encodedOffset & 0xFFF_FFFF;
|
long size = SpansCodec.decodeSize(encodedOffset);
|
||||||
long offset = encodedOffset >>> 28;
|
long offset = SpansCodec.decodeStartOffset(encodedOffset);
|
||||||
|
|
||||||
var buffer = arena.allocate(size).asByteBuffer();
|
var buffer = arena.allocate(size).asByteBuffer();
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
@ -33,22 +30,16 @@ public class ForwardIndexSpansReader implements AutoCloseable {
|
|||||||
|
|
||||||
int count = buffer.get();
|
int count = buffer.get();
|
||||||
|
|
||||||
List<SpanData> ret = new ArrayList<>();
|
DocumentSpans ret = new DocumentSpans();
|
||||||
|
|
||||||
while (count-- > 0) {
|
while (count-- > 0) {
|
||||||
byte code = buffer.get();
|
byte code = buffer.get();
|
||||||
short len = buffer.getShort();
|
short len = buffer.getShort();
|
||||||
|
|
||||||
final int pos = buffer.position();
|
ret.accept(code, new GammaCodedSequence(buffer.slice(buffer.position(), len)));
|
||||||
|
|
||||||
// Decode the gamma-coded sequence; this will advance the buffer position
|
|
||||||
// in a not entirely predictable way, so we need to save the position
|
|
||||||
buffer.limit(buffer.position() + len);
|
|
||||||
var sequence = new GammaCodedSequence(buffer).values();
|
|
||||||
ret.add(new SpanData(code, sequence));
|
|
||||||
|
|
||||||
// Reset the buffer position to the end of the span
|
// Reset the buffer position to the end of the span
|
||||||
buffer.position(pos + len);
|
buffer.position(buffer.position() + len);
|
||||||
buffer.limit(buffer.capacity());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
@ -59,5 +50,4 @@ public class ForwardIndexSpansReader implements AutoCloseable {
|
|||||||
spansFileChannel.close();
|
spansFileChannel.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
public record SpanData(byte code, IntList data) {}
|
|
||||||
}
|
}
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.index.forward;
|
package nu.marginalia.index.forward.spans;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
@ -42,8 +42,7 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public long endRecord() {
|
public long endRecord() {
|
||||||
return stateStartOffset << 28 | stateLength;
|
return SpansCodec.encode(stateStartOffset, stateLength);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
@ -0,0 +1,17 @@
|
|||||||
|
package nu.marginalia.index.forward.spans;
|
||||||
|
|
||||||
|
/**
 * Packs the location of a spans record into a single long: the low 28 bits hold
 * the record size, and the remaining high 36 bits hold the start offset.
 */
public class SpansCodec {
    /** Number of low bits reserved for the record size */
    private static final int SIZE_BITS = 28;

    /** Mask selecting the size bits; also the largest encodable size */
    private static final long SIZE_MASK = (1L << SIZE_BITS) - 1;

    /** Largest start offset that survives the left shift without losing bits */
    private static final long MAX_START_OFFSET = (1L << (Long.SIZE - SIZE_BITS)) - 1;

    /**
     * Encodes a start offset and a size into one long.
     *
     * @param startOffset the start offset, in the range [0, 2^36)
     * @param size        the record size, in the range [0, 2^28)
     * @return the packed value, readable with {@link #decodeStartOffset(long)}
     *         and {@link #decodeSize(long)}
     * @throws IllegalArgumentException if either argument is out of range; this is
     *         checked unconditionally since a silently truncated value would point
     *         at the wrong data when decoded
     */
    public static long encode(long startOffset, long size) {
        if (size < 0 || size > SIZE_MASK) {
            throw new IllegalArgumentException("Size must be non-negative and less than 2^28, was " + size);
        }
        if (startOffset < 0 || startOffset > MAX_START_OFFSET) {
            throw new IllegalArgumentException("Start offset must be non-negative and less than 2^36, was " + startOffset);
        }

        return (startOffset << SIZE_BITS) | size;
    }

    /** Extracts the start offset (the high 36 bits) from an encoded value */
    public static long decodeStartOffset(long encoded) {
        return encoded >>> SIZE_BITS;
    }

    /** Extracts the size (the low 28 bits) from an encoded value */
    public static long decodeSize(long encoded) {
        return encoded & SIZE_MASK;
    }
}
|
@ -2,6 +2,7 @@ package nu.marginalia.index.forward;
|
|||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||||
|
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
|
||||||
import nu.marginalia.index.journal.IndexJournal;
|
import nu.marginalia.index.journal.IndexJournal;
|
||||||
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.index.forward;
|
package nu.marginalia.index.forward;
|
||||||
|
|
||||||
import it.unimi.dsi.fastutil.ints.IntList;
|
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
|
||||||
|
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
@ -11,7 +12,7 @@ import java.nio.ByteBuffer;
|
|||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
class ForwardIndexSpansReaderTest {
|
class ForwardIndexSpansReaderTest {
|
||||||
Path testFile = Files.createTempFile("test", ".idx");
|
Path testFile = Files.createTempFile("test", ".idx");
|
||||||
@ -32,12 +33,12 @@ class ForwardIndexSpansReaderTest {
|
|||||||
long offset2;
|
long offset2;
|
||||||
try (var writer = new ForwardIndexSpansWriter(testFile)) {
|
try (var writer = new ForwardIndexSpansWriter(testFile)) {
|
||||||
writer.beginRecord(1);
|
writer.beginRecord(1);
|
||||||
writer.writeSpan((byte) 'a', GammaCodedSequence.generate(wa, 1, 3, 5).buffer());
|
writer.writeSpan((byte) 'h', GammaCodedSequence.generate(wa, 1, 3, 5, 8).buffer());
|
||||||
offset1 = writer.endRecord();
|
offset1 = writer.endRecord();
|
||||||
|
|
||||||
writer.beginRecord(2);
|
writer.beginRecord(2);
|
||||||
writer.writeSpan((byte) 'b', GammaCodedSequence.generate(wa, 2, 4, 6).buffer());
|
writer.writeSpan((byte) 'c', GammaCodedSequence.generate(wa, 2, 4, 6, 7).buffer());
|
||||||
writer.writeSpan((byte) 'c', GammaCodedSequence.generate(wa, 3, 5, 7).buffer());
|
writer.writeSpan((byte) 'p', GammaCodedSequence.generate(wa, 3, 5).buffer());
|
||||||
offset2 = writer.endRecord();
|
offset2 = writer.endRecord();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -47,17 +48,21 @@ class ForwardIndexSpansReaderTest {
|
|||||||
var spans1 = reader.readSpans(arena, offset1);
|
var spans1 = reader.readSpans(arena, offset1);
|
||||||
var spans2 = reader.readSpans(arena, offset2);
|
var spans2 = reader.readSpans(arena, offset2);
|
||||||
|
|
||||||
assertEquals(1, spans1.size());
|
assertEquals(2, spans1.heading.size());
|
||||||
|
|
||||||
assertEquals('a', spans1.get(0).code());
|
assertEquals(2, spans2.code.size());
|
||||||
assertEquals(IntList.of(1, 3, 5), spans1.get(0).data());
|
|
||||||
|
|
||||||
assertEquals(2, spans2.size());
|
assertFalse(spans2.code.containsPosition(1));
|
||||||
|
assertTrue(spans2.code.containsPosition(3));
|
||||||
|
assertFalse(spans2.code.containsPosition(5));
|
||||||
|
assertTrue(spans2.code.containsPosition(6));
|
||||||
|
assertFalse(spans2.code.containsPosition(7));
|
||||||
|
assertFalse(spans2.code.containsPosition(8));
|
||||||
|
|
||||||
assertEquals('b', spans2.get(0).code());
|
assertEquals(1, spans2.pre.size());
|
||||||
assertEquals(IntList.of(2, 4, 6), spans2.get(0).data());
|
|
||||||
assertEquals('c', spans2.get(1).code());
|
assertEquals(0, spans2.pageFooter.size());
|
||||||
assertEquals(IntList.of(3, 5, 7), spans2.get(1).data());
|
assertFalse(spans2.pageFooter.containsPosition(8));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -317,6 +317,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
try {
|
try {
|
||||||
executeSearch();
|
executeSearch();
|
||||||
}
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
logger.error("Error in index lookup", ex);
|
||||||
|
}
|
||||||
finally {
|
finally {
|
||||||
synchronized (remainingIndexTasks) {
|
synchronized (remainingIndexTasks) {
|
||||||
if (remainingIndexTasks.decrementAndGet() == 0) {
|
if (remainingIndexTasks.decrementAndGet() == 0) {
|
||||||
|
@ -8,6 +8,7 @@ import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggre
|
|||||||
import nu.marginalia.index.FullReverseIndexReader;
|
import nu.marginalia.index.FullReverseIndexReader;
|
||||||
import nu.marginalia.index.PrioReverseIndexReader;
|
import nu.marginalia.index.PrioReverseIndexReader;
|
||||||
import nu.marginalia.index.forward.ForwardIndexReader;
|
import nu.marginalia.index.forward.ForwardIndexReader;
|
||||||
|
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||||
import nu.marginalia.index.model.QueryParams;
|
import nu.marginalia.index.model.QueryParams;
|
||||||
import nu.marginalia.index.model.SearchTerms;
|
import nu.marginalia.index.model.SearchTerms;
|
||||||
import nu.marginalia.index.query.IndexQuery;
|
import nu.marginalia.index.query.IndexQuery;
|
||||||
@ -186,11 +187,17 @@ public class CombinedIndexReader {
|
|||||||
/** Retrieves the HTML features for the specified document */
|
/** Retrieves the HTML features for the specified document */
|
||||||
public int getHtmlFeatures(long docId) {
|
public int getHtmlFeatures(long docId) {
|
||||||
return forwardIndexReader.getHtmlFeatures(docId);
|
return forwardIndexReader.getHtmlFeatures(docId);
|
||||||
} /** Retrieves the HTML features for the specified document */
|
}
|
||||||
|
|
||||||
|
/** Retrieves the document size for the specified document */
|
||||||
public int getDocumentSize(long docId) {
|
public int getDocumentSize(long docId) {
|
||||||
return forwardIndexReader.getDocumentSize(docId);
|
return forwardIndexReader.getDocumentSize(docId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Retrieves the document spans for the specified document */
|
||||||
|
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
|
||||||
|
return forwardIndexReader.getDocumentSpans(arena, docId);
|
||||||
|
}
|
||||||
|
|
||||||
/** Close the indexes (this is not done immediately)
|
/** Close the indexes (this is not done immediately)
|
||||||
* */
|
* */
|
||||||
|
@ -98,7 +98,7 @@ public class IndexResultRankingService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Calculate the preliminary score
|
// Calculate the preliminary score
|
||||||
var score = resultRanker.calculateScore(resultIds.at(i), searchTerms, flags, positions);
|
var score = resultRanker.calculateScore(arena, resultIds.at(i), searchTerms, flags, positions);
|
||||||
if (score != null) {
|
if (score != null) {
|
||||||
results.add(score);
|
results.add(score);
|
||||||
}
|
}
|
||||||
|
@ -3,15 +3,18 @@ package nu.marginalia.index.results;
|
|||||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.aggregate.CqDoubleSumOperator;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||||
|
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||||
import nu.marginalia.index.index.CombinedIndexReader;
|
import nu.marginalia.index.index.CombinedIndexReader;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
import nu.marginalia.index.model.QueryParams;
|
import nu.marginalia.index.model.QueryParams;
|
||||||
import nu.marginalia.index.model.SearchParameters;
|
import nu.marginalia.index.model.SearchParameters;
|
||||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||||
import nu.marginalia.index.results.model.QuerySearchTerms;
|
import nu.marginalia.index.results.model.QuerySearchTerms;
|
||||||
|
import nu.marginalia.index.results.model.TermCoherenceGroupList;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
@ -22,6 +25,7 @@ import nu.marginalia.sequence.CodedSequence;
|
|||||||
import nu.marginalia.sequence.SequenceOperations;
|
import nu.marginalia.sequence.SequenceOperations;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
|
import java.lang.foreign.Arena;
|
||||||
|
|
||||||
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
|
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
|
||||||
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate;
|
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate;
|
||||||
@ -50,7 +54,8 @@ public class IndexResultScoreCalculator {
|
|||||||
private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
|
private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
|
||||||
|
|
||||||
@Nullable
|
@Nullable
|
||||||
public SearchResultItem calculateScore(long combinedId,
|
public SearchResultItem calculateScore(Arena arena,
|
||||||
|
long combinedId,
|
||||||
QuerySearchTerms searchTerms,
|
QuerySearchTerms searchTerms,
|
||||||
long[] wordFlags,
|
long[] wordFlags,
|
||||||
CodedSequence[] positions)
|
CodedSequence[] positions)
|
||||||
@ -78,8 +83,7 @@ public class IndexResultScoreCalculator {
|
|||||||
long docMetadata = index.getDocumentMetadata(docId);
|
long docMetadata = index.getDocumentMetadata(docId);
|
||||||
int htmlFeatures = index.getHtmlFeatures(docId);
|
int htmlFeatures = index.getHtmlFeatures(docId);
|
||||||
int docSize = index.getDocumentSize(docId);
|
int docSize = index.getDocumentSize(docId);
|
||||||
|
DocumentSpans spans = index.getDocumentSpans(arena, docId);
|
||||||
int bestCoherence = searchTerms.coherences.testOptional(positions);
|
|
||||||
|
|
||||||
double score = calculateSearchResultValue(
|
double score = calculateSearchResultValue(
|
||||||
wordFlagsQuery,
|
wordFlagsQuery,
|
||||||
@ -88,7 +92,9 @@ public class IndexResultScoreCalculator {
|
|||||||
docMetadata,
|
docMetadata,
|
||||||
htmlFeatures,
|
htmlFeatures,
|
||||||
docSize,
|
docSize,
|
||||||
bestCoherence,
|
spans,
|
||||||
|
positions,
|
||||||
|
searchTerms.coherences,
|
||||||
rankingContext);
|
rankingContext);
|
||||||
|
|
||||||
SearchResultItem searchResult = new SearchResultItem(docId,
|
SearchResultItem searchResult = new SearchResultItem(docId,
|
||||||
@ -169,10 +175,13 @@ public class IndexResultScoreCalculator {
|
|||||||
|
|
||||||
public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
|
public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
|
||||||
CompiledQueryInt positionsCountQuery,
|
CompiledQueryInt positionsCountQuery,
|
||||||
CompiledQuery<CodedSequence> positionsQuery, long documentMetadata,
|
CompiledQuery<CodedSequence> positionsQuery,
|
||||||
|
long documentMetadata,
|
||||||
int features,
|
int features,
|
||||||
int length,
|
int length,
|
||||||
int bestCoherence,
|
DocumentSpans spans,
|
||||||
|
CodedSequence[] positions,
|
||||||
|
TermCoherenceGroupList coherences,
|
||||||
ResultRankingContext ctx)
|
ResultRankingContext ctx)
|
||||||
{
|
{
|
||||||
if (length < 0) {
|
if (length < 0) {
|
||||||
@ -205,6 +214,33 @@ public class IndexResultScoreCalculator {
|
|||||||
temporalBias = 0;
|
temporalBias = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int numCoherenceAll = coherences.countOptional(positions);
|
||||||
|
int bestCoherenceAll = coherences.testOptional(positions);
|
||||||
|
int bestCoherenceTitle = coherences.testOptional(positions, spans.title);
|
||||||
|
int bestCoherenceHeading = coherences.testOptional(positions, spans.heading);
|
||||||
|
|
||||||
|
double spanWeightedScore = positionsQuery.root.visit(new CqDoubleSumOperator(positionsQuery, termPos -> {
|
||||||
|
if (termPos == null)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (spans.title.overlapsRange(termPos))
|
||||||
|
return 5.0;
|
||||||
|
if (spans.heading.overlapsRange(termPos))
|
||||||
|
return 2.5;
|
||||||
|
if (spans.code.overlapsRange(termPos))
|
||||||
|
return 0.25;
|
||||||
|
if (spans.pre.overlapsRange(termPos))
|
||||||
|
return 0.25;
|
||||||
|
if (spans.nav.overlapsRange(termPos))
|
||||||
|
return 0.25;
|
||||||
|
if (spans.pageHeader.overlapsRange(termPos))
|
||||||
|
return 0.25;
|
||||||
|
if (spans.pageFooter.overlapsRange(termPos))
|
||||||
|
return 0.25;
|
||||||
|
return 1.0;
|
||||||
|
}));
|
||||||
|
|
||||||
double overallPart = averageSentenceLengthPenalty
|
double overallPart = averageSentenceLengthPenalty
|
||||||
+ documentLengthPenalty
|
+ documentLengthPenalty
|
||||||
+ qualityPenalty
|
+ qualityPenalty
|
||||||
@ -212,7 +248,11 @@ public class IndexResultScoreCalculator {
|
|||||||
+ topologyBonus
|
+ topologyBonus
|
||||||
+ temporalBias
|
+ temporalBias
|
||||||
+ flagsPenalty
|
+ flagsPenalty
|
||||||
+ bestCoherence;
|
+ bestCoherenceAll
|
||||||
|
+ bestCoherenceTitle
|
||||||
|
+ bestCoherenceHeading
|
||||||
|
+ numCoherenceAll / 4.
|
||||||
|
+ spanWeightedScore;
|
||||||
|
|
||||||
double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));
|
double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));
|
||||||
double tcfFirstPosition = 0.;
|
double tcfFirstPosition = 0.;
|
||||||
|
@ -2,6 +2,7 @@ package nu.marginalia.index.results.model;
|
|||||||
|
|
||||||
import it.unimi.dsi.fastutil.ints.IntIterator;
|
import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
|
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
|
||||||
|
import nu.marginalia.index.forward.spans.DocumentSpan;
|
||||||
import nu.marginalia.index.model.SearchTermsUtil;
|
import nu.marginalia.index.model.SearchTermsUtil;
|
||||||
import nu.marginalia.index.results.model.ids.TermIdList;
|
import nu.marginalia.index.results.model.ids.TermIdList;
|
||||||
import nu.marginalia.sequence.CodedSequence;
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
@ -40,7 +41,7 @@ public class TermCoherenceGroupList {
|
|||||||
|
|
||||||
public int testOptional(CodedSequence[] positions) {
|
public int testOptional(CodedSequence[] positions) {
|
||||||
int best = 0;
|
int best = 0;
|
||||||
for (var coherenceSet : mandatoryGroups) {
|
for (var coherenceSet : optionalGroups) {
|
||||||
if (coherenceSet.test(positions)) {
|
if (coherenceSet.test(positions)) {
|
||||||
best = Math.max(coherenceSet.size, best);
|
best = Math.max(coherenceSet.size, best);
|
||||||
}
|
}
|
||||||
@ -48,6 +49,25 @@ public class TermCoherenceGroupList {
|
|||||||
return best;
|
return best;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public int countOptional(CodedSequence[] positions) {
|
||||||
|
int ct = 0;
|
||||||
|
for (var coherenceSet : optionalGroups) {
|
||||||
|
if (coherenceSet.test(positions)) {
|
||||||
|
ct++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ct;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int testOptional(CodedSequence[] positions, DocumentSpan span) {
|
||||||
|
int best = 0;
|
||||||
|
for (var coherenceSet : optionalGroups) {
|
||||||
|
if (coherenceSet.test(span, positions)) {
|
||||||
|
best = Math.max(coherenceSet.size, best);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return best;
|
||||||
|
}
|
||||||
|
|
||||||
public static final class TermCoherenceGroup {
|
public static final class TermCoherenceGroup {
|
||||||
private final int[] offsets;
|
private final int[] offsets;
|
||||||
@ -92,5 +112,37 @@ public class TermCoherenceGroupList {
|
|||||||
|
|
||||||
return SequenceOperations.intersectSequences(sequences);
|
return SequenceOperations.intersectSequences(sequences);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean test(DocumentSpan span, CodedSequence[] positions) {
|
||||||
|
IntIterator[] sequences = new IntIterator[present.cardinality()];
|
||||||
|
|
||||||
|
for (int oi = 0, si = 0; oi < offsets.length; oi++) {
|
||||||
|
if (!present.get(oi)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
int offset = offsets[oi];
|
||||||
|
if (offset < 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// Create iterators that are offset by their relative position in the
|
||||||
|
// sequence. This is done by subtracting the index from the offset,
|
||||||
|
// so that when we intersect them, an overlap means that the terms are
|
||||||
|
// in the correct order. Note the offset is negative!
|
||||||
|
|
||||||
|
sequences[si++] = positions[offset].offsetIterator(-oi);
|
||||||
|
}
|
||||||
|
|
||||||
|
var intersections = SequenceOperations.findIntersections(sequences);
|
||||||
|
|
||||||
|
for (int idx = 0; idx < intersections.size(); idx++) {
|
||||||
|
if (span.containsRange(intersections.getInt(idx), sequences.length)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -9,8 +9,8 @@ import nu.marginalia.hash.MurmurHash3_128;
|
|||||||
import nu.marginalia.index.construction.DocIdRewriter;
|
import nu.marginalia.index.construction.DocIdRewriter;
|
||||||
import nu.marginalia.index.construction.full.FullIndexConstructor;
|
import nu.marginalia.index.construction.full.FullIndexConstructor;
|
||||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||||
import nu.marginalia.index.forward.ForwardIndexConverter;
|
|
||||||
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
||||||
|
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
|
||||||
import nu.marginalia.index.index.CombinedIndexReader;
|
import nu.marginalia.index.index.CombinedIndexReader;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
import nu.marginalia.index.journal.IndexJournal;
|
import nu.marginalia.index.journal.IndexJournal;
|
||||||
|
@ -11,8 +11,8 @@ import nu.marginalia.index.construction.DocIdRewriter;
|
|||||||
import nu.marginalia.index.construction.full.FullIndexConstructor;
|
import nu.marginalia.index.construction.full.FullIndexConstructor;
|
||||||
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
|
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
|
||||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||||
import nu.marginalia.index.forward.ForwardIndexConverter;
|
|
||||||
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
||||||
|
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
import nu.marginalia.index.journal.IndexJournal;
|
import nu.marginalia.index.journal.IndexJournal;
|
||||||
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
||||||
|
@ -13,8 +13,8 @@ import nu.marginalia.index.construction.DocIdRewriter;
|
|||||||
import nu.marginalia.index.construction.full.FullIndexConstructor;
|
import nu.marginalia.index.construction.full.FullIndexConstructor;
|
||||||
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
|
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
|
||||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||||
import nu.marginalia.index.forward.ForwardIndexConverter;
|
|
||||||
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
||||||
|
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
import nu.marginalia.index.journal.IndexJournal;
|
import nu.marginalia.index.journal.IndexJournal;
|
||||||
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
package nu.marginalia.sequence;
|
package nu.marginalia.sequence;
|
||||||
|
|
||||||
|
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||||
import it.unimi.dsi.fastutil.ints.IntIterator;
|
import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||||
|
import it.unimi.dsi.fastutil.ints.IntList;
|
||||||
|
|
||||||
public class SequenceOperations {
|
public class SequenceOperations {
|
||||||
|
|
||||||
@ -30,7 +32,7 @@ public class SequenceOperations {
|
|||||||
if (values[i] == max) {
|
if (values[i] == max) {
|
||||||
successes++;
|
successes++;
|
||||||
} else {
|
} else {
|
||||||
successes = 0;
|
successes = 1;
|
||||||
|
|
||||||
// Discard values until we reach the maximum value seen so far,
|
// Discard values until we reach the maximum value seen so far,
|
||||||
// or until the end of the sequence is reached
|
// or until the end of the sequence is reached
|
||||||
@ -49,6 +51,63 @@ public class SequenceOperations {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static IntList findIntersections(IntIterator... sequences) {
|
||||||
|
|
||||||
|
if (sequences.length <= 1)
|
||||||
|
return IntList.of();
|
||||||
|
|
||||||
|
// Initialize values and find the maximum value
|
||||||
|
int[] values = new int[sequences.length];
|
||||||
|
|
||||||
|
for (int i = 0; i < sequences.length; i++) {
|
||||||
|
if (sequences[i].hasNext())
|
||||||
|
values[i] = sequences[i].nextInt();
|
||||||
|
else
|
||||||
|
return IntList.of();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Intersect the sequences by advancing all values smaller than the maximum seen so far
|
||||||
|
// until they are equal to the maximum value, or until the end of the sequence is reached
|
||||||
|
int max = Integer.MIN_VALUE;
|
||||||
|
int successes = 0;
|
||||||
|
|
||||||
|
IntList ret = new IntArrayList();
|
||||||
|
|
||||||
|
outer:
|
||||||
|
for (int i = 0;; i = (i + 1) % sequences.length)
|
||||||
|
{
|
||||||
|
if (successes == sequences.length) {
|
||||||
|
ret.add(max);
|
||||||
|
successes = 1;
|
||||||
|
|
||||||
|
if (sequences[i].hasNext()) {
|
||||||
|
max = sequences[i].nextInt();
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else if (values[i] == max) {
|
||||||
|
successes++;
|
||||||
|
} else {
|
||||||
|
successes = 1;
|
||||||
|
|
||||||
|
// Discard values until we reach the maximum value seen so far,
|
||||||
|
// or until the end of the sequence is reached
|
||||||
|
while (values[i] < max) {
|
||||||
|
if (sequences[i].hasNext()) {
|
||||||
|
values[i] = sequences[i].nextInt();
|
||||||
|
} else {
|
||||||
|
break outer;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update the maximum value, if necessary
|
||||||
|
max = Math.max(max, values[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/** Return the minimum word distance between two sequences, or a negative value if either sequence is empty.
|
/** Return the minimum word distance between two sequences, or a negative value if either sequence is empty.
|
||||||
* */
|
* */
|
||||||
public static int minDistance(IntIterator seqA, IntIterator seqB)
|
public static int minDistance(IntIterator seqA, IntIterator seqB)
|
||||||
|
@ -162,7 +162,15 @@ public class BitReader {
|
|||||||
}
|
}
|
||||||
else { // There's no more data to read!
|
else { // There's no more data to read!
|
||||||
refillCallback.run();
|
refillCallback.run();
|
||||||
readNext();
|
if (underlying.hasRemaining()) {
|
||||||
|
readNext();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// We've attempted to re-fill the buffer, but there's still no data to read, so we fail to avoid
|
||||||
|
// blowing up the stack with recursion
|
||||||
|
throw new IllegalStateException("No more data to read after attempted re-fill of underlying buffer");
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.sequence;
|
package nu.marginalia.sequence;
|
||||||
|
|
||||||
import it.unimi.dsi.fastutil.ints.IntIterator;
|
import it.unimi.dsi.fastutil.ints.IntList;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
@ -63,6 +63,17 @@ class SequenceOperationsTest {
|
|||||||
assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator()));
|
assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void intersectSequencesDeepMatch3findIntersections() {
|
||||||
|
ByteBuffer wa = ByteBuffer.allocate(1024);
|
||||||
|
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 10, 11);
|
||||||
|
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 10, 14);
|
||||||
|
GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 1, 5, 8, 9, 10);
|
||||||
|
|
||||||
|
assertEquals(IntList.of(8, 10), SequenceOperations.findIntersections(seq1.iterator(), seq2.iterator(), seq3.iterator()));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void intersectSequencesDeepMismatch() {
|
void intersectSequencesDeepMismatch() {
|
||||||
ByteBuffer wa = ByteBuffer.allocate(1024);
|
ByteBuffer wa = ByteBuffer.allocate(1024);
|
||||||
|
@ -54,8 +54,8 @@ public class HtmlStringTagger implements NodeVisitor {
|
|||||||
case "code" -> pushTag(HtmlTag.CODE, el);
|
case "code" -> pushTag(HtmlTag.CODE, el);
|
||||||
case "title" -> pushTag(HtmlTag.TITLE, el);
|
case "title" -> pushTag(HtmlTag.TITLE, el);
|
||||||
case "nav" -> pushTag(HtmlTag.NAV, el);
|
case "nav" -> pushTag(HtmlTag.NAV, el);
|
||||||
case "header" -> pushTag(HtmlTag.HEADER, el);
|
case "header" -> pushTag(HtmlTag.PAGE_HEADER, el);
|
||||||
case "footer" -> pushTag(HtmlTag.FOOTER, el);
|
case "footer" -> pushTag(HtmlTag.PAGE_FOOTER, el);
|
||||||
case "h1", "h2", "h3", "h4", "h5", "h6" -> pushTag(HtmlTag.HEADING, el);
|
case "h1", "h2", "h3", "h4", "h5", "h6" -> pushTag(HtmlTag.HEADING, el);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,21 +1,21 @@
|
|||||||
package nu.marginalia.language.sentence.tag;
|
package nu.marginalia.language.sentence.tag;
|
||||||
|
|
||||||
public enum HtmlTag {
|
public enum HtmlTag {
|
||||||
SCRIPT('s', true, false),
|
SCRIPT((byte) 's', true, false),
|
||||||
STYLE('S', true, false),
|
STYLE((byte) 'S', true, false),
|
||||||
CODE('c', false, true),
|
CODE((byte) 'c', false, true),
|
||||||
PRE('p', false, true),
|
PRE((byte) 'p', false, true),
|
||||||
TITLE('t', false, false),
|
TITLE((byte) 't', false, false),
|
||||||
HEADING('h', false, false),
|
HEADING((byte) 'h', false, false),
|
||||||
NAV('n', false, false),
|
NAV((byte) 'n', false, false),
|
||||||
HEADER('H',false, false),
|
PAGE_HEADER((byte) 'H',false, false),
|
||||||
FOOTER('f', false, false);
|
PAGE_FOOTER((byte) 'f', false, false);
|
||||||
|
|
||||||
public char code;
|
public byte code;
|
||||||
public boolean exclude;
|
public boolean exclude;
|
||||||
public boolean nonLanguage;
|
public boolean nonLanguage;
|
||||||
|
|
||||||
HtmlTag(char code, boolean exclude, boolean nonLanguage) {
|
HtmlTag(byte code, boolean exclude, boolean nonLanguage) {
|
||||||
this.code = code;
|
this.code = code;
|
||||||
this.exclude = exclude;
|
this.exclude = exclude;
|
||||||
this.nonLanguage = nonLanguage;
|
this.nonLanguage = nonLanguage;
|
||||||
|
@ -144,7 +144,7 @@ public class DocumentKeywordsBuilder {
|
|||||||
|
|
||||||
public void addSpans(List<DocumentWordSpan> newSpans) {
|
public void addSpans(List<DocumentWordSpan> newSpans) {
|
||||||
for (var span : newSpans) {
|
for (var span : newSpans) {
|
||||||
wordSpans.computeIfAbsent(span.tag().code, k -> new ArrayList<>()).add(span);
|
wordSpans.computeIfAbsent((char) span.tag().code, k -> new ArrayList<>()).add(span);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9,8 +9,8 @@ import nu.marginalia.ProcessConfigurationModule;
|
|||||||
import nu.marginalia.index.construction.full.FullIndexConstructor;
|
import nu.marginalia.index.construction.full.FullIndexConstructor;
|
||||||
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
|
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
|
||||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||||
import nu.marginalia.index.forward.ForwardIndexConverter;
|
|
||||||
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
||||||
|
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
|
||||||
import nu.marginalia.index.journal.IndexJournal;
|
import nu.marginalia.index.journal.IndexJournal;
|
||||||
import nu.marginalia.model.gson.GsonFactory;
|
import nu.marginalia.model.gson.GsonFactory;
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
|
@ -18,8 +18,8 @@ import nu.marginalia.index.ReverseIndexPrioFileNames;
|
|||||||
import nu.marginalia.index.construction.full.FullIndexConstructor;
|
import nu.marginalia.index.construction.full.FullIndexConstructor;
|
||||||
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
|
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
|
||||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||||
import nu.marginalia.index.forward.ForwardIndexConverter;
|
|
||||||
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
||||||
|
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
import nu.marginalia.index.journal.IndexJournal;
|
import nu.marginalia.index.journal.IndexJournal;
|
||||||
import nu.marginalia.index.model.SearchParameters;
|
import nu.marginalia.index.model.SearchParameters;
|
||||||
|
Loading…
Reference in New Issue
Block a user