(coded-sequence) Replace GCS usage with an interface

This commit is contained in:
Viktor Lofgren 2024-07-16 14:37:50 +02:00
parent 5c098005cc
commit 0b31c4cfbb
21 changed files with 68 additions and 39 deletions

View File

@ -1,6 +1,6 @@
package nu.marginalia.keyword.model; package nu.marginalia.keyword.model;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.CodedSequence;
import java.io.Serial; import java.io.Serial;
import java.io.Serializable; import java.io.Serializable;
@ -12,11 +12,11 @@ public final class DocumentKeywords implements Serializable {
public final String[] keywords; public final String[] keywords;
public final long[] metadata; public final long[] metadata;
public final GammaCodedSequence[] positions; public final CodedSequence[] positions;
public DocumentKeywords(String[] keywords, public DocumentKeywords(String[] keywords,
long[] metadata, long[] metadata,
GammaCodedSequence[] positions) CodedSequence[] positions)
{ {
this.keywords = keywords; this.keywords = keywords;
this.metadata = metadata; this.metadata = metadata;

View File

@ -6,6 +6,7 @@ import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import lombok.Getter; import lombok.Getter;
import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.GammaCodedSequence;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -36,7 +37,7 @@ public class DocumentKeywordsBuilder {
public DocumentKeywords build(ByteBuffer workArea) { public DocumentKeywords build(ByteBuffer workArea) {
final String[] wordArray = new String[wordToMeta.size()]; final String[] wordArray = new String[wordToMeta.size()];
final long[] meta = new long[wordToMeta.size()]; final long[] meta = new long[wordToMeta.size()];
final GammaCodedSequence[] positions = new GammaCodedSequence[wordToMeta.size()]; final CodedSequence[] positions = new CodedSequence[wordToMeta.size()];
var iter = wordToMeta.object2LongEntrySet().fastIterator(); var iter = wordToMeta.object2LongEntrySet().fastIterator();

View File

@ -5,6 +5,7 @@ import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
@ -93,7 +94,7 @@ class DocumentKeywordExtractorTest {
var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024)); var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024));
Map<String, WordMetadata> flags = new HashMap<>(); Map<String, WordMetadata> flags = new HashMap<>();
Map<String, GammaCodedSequence> positions = new HashMap<>(); Map<String, CodedSequence> positions = new HashMap<>();
for (int i = 0; i < keywordsBuilt.size(); i++) { for (int i = 0; i < keywordsBuilt.size(); i++) {
String keyword = keywordsBuilt.keywords[i]; String keyword = keywordsBuilt.keywords[i];

View File

@ -9,6 +9,7 @@ import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl; import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.test.TestUtil; import nu.marginalia.test.TestUtil;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
@ -84,7 +85,7 @@ class ForwardIndexConverterTest {
new IndexJournalEntryData( new IndexJournalEntryData(
new String[]{}, new String[]{},
new long[]{}, new long[]{},
new GammaCodedSequence[]{} new CodedSequence[]{}
) )
); );
} }

View File

@ -1,11 +1,11 @@
package nu.marginalia.index.journal.model; package nu.marginalia.index.journal.model;
import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.CodedSequence;
public record IndexJournalEntryData(long[] termIds, public record IndexJournalEntryData(long[] termIds,
long[] metadata, long[] metadata,
GammaCodedSequence[] positions) { CodedSequence[] positions) {
public IndexJournalEntryData { public IndexJournalEntryData {
assert termIds.length == metadata.length; assert termIds.length == metadata.length;
@ -14,7 +14,7 @@ public record IndexJournalEntryData(long[] termIds,
public IndexJournalEntryData(String[] keywords, public IndexJournalEntryData(String[] keywords,
long[] metadata, long[] metadata,
GammaCodedSequence[] positions) CodedSequence[] positions)
{ {
this(termIds(keywords), metadata, positions); this(termIds(keywords), metadata, positions);
} }

View File

@ -1,5 +1,6 @@
package nu.marginalia.index.journal.model; package nu.marginalia.index.journal.model;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.GammaCodedSequence;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
@ -17,7 +18,7 @@ public record IndexJournalEntryTermData(
long metadata, long metadata,
ByteBuffer positionsBuffer) ByteBuffer positionsBuffer)
{ {
public GammaCodedSequence positions() { public CodedSequence positions() {
return new GammaCodedSequence(positionsBuffer); return new GammaCodedSequence(positionsBuffer);
} }

View File

@ -5,7 +5,7 @@ import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader; import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.CodedSequence;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -81,7 +81,7 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
{ {
final long[] keywords = data.termIds(); final long[] keywords = data.termIds();
final long[] metadata = data.metadata(); final long[] metadata = data.metadata();
final GammaCodedSequence[] positions = data.positions(); final CodedSequence[] positions = data.positions();
int entrySize = 0; int entrySize = 0;
for (var position : positions) { for (var position : positions) {

View File

@ -1,7 +1,6 @@
package nu.marginalia.index.construction; package nu.marginalia.index.construction;
import nu.marginalia.index.positions.PositionCodec; import nu.marginalia.index.positions.PositionCodec;
import nu.marginalia.sequence.GammaCodedSequence;
import java.io.IOException; import java.io.IOException;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;

View File

@ -1,5 +1,6 @@
package nu.marginalia.index.positions; package nu.marginalia.index.positions;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.GammaCodedSequence;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
@ -15,7 +16,7 @@ public class TermData {
return buffer.get(0); return buffer.get(0);
} }
public GammaCodedSequence positions() { public CodedSequence positions() {
return new GammaCodedSequence(buffer, 1, buffer.capacity()); return new GammaCodedSequence(buffer, 1, buffer.capacity());
} }
} }

View File

@ -22,7 +22,7 @@ import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.index.results.model.ids.TermMetadataList; import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.linkdb.docs.DocumentDbReader; import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.CodedSequence;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -75,7 +75,7 @@ public class IndexResultRankingService {
// thrashing in there; out here we can rely on implicit array ordering to match up the data. // thrashing in there; out here we can rely on implicit array ordering to match up the data.
long[] flags = new long[termCount]; long[] flags = new long[termCount];
GammaCodedSequence[] positions = new GammaCodedSequence[termCount]; CodedSequence[] positions = new CodedSequence[termCount];
// Iterate over documents by their index in the combinedDocIds, as we need the index for the // Iterate over documents by their index in the combinedDocIds, as we need the index for the
// term data arrays as well // term data arrays as well

View File

@ -16,7 +16,7 @@ import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations; import nu.marginalia.sequence.SequenceOperations;
import javax.annotation.Nullable; import javax.annotation.Nullable;
@ -50,10 +50,10 @@ public class IndexResultScoreCalculator {
public SearchResultItem calculateScore(long combinedId, public SearchResultItem calculateScore(long combinedId,
QuerySearchTerms searchTerms, QuerySearchTerms searchTerms,
long[] wordFlags, long[] wordFlags,
GammaCodedSequence[] positions) CodedSequence[] positions)
{ {
CompiledQuery<GammaCodedSequence> positionsQuery = compiledQuery.root.newQuery(positions); CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
int[] counts = new int[compiledQuery.size()]; int[] counts = new int[compiledQuery.size()];
@ -116,7 +116,7 @@ public class IndexResultScoreCalculator {
return false; return false;
} }
private boolean hasPrioTerm(QuerySearchTerms searchTerms, GammaCodedSequence[] positions) { private boolean hasPrioTerm(QuerySearchTerms searchTerms, CodedSequence[] positions) {
var allTerms = searchTerms.termIdsAll; var allTerms = searchTerms.termIdsAll;
var prioTerms = searchTerms.termIdsPrio; var prioTerms = searchTerms.termIdsPrio;
@ -166,7 +166,7 @@ public class IndexResultScoreCalculator {
public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery, public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
CompiledQueryInt positionsCountQuery, CompiledQueryInt positionsCountQuery,
CompiledQuery<GammaCodedSequence> positionsQuery, long documentMetadata, CompiledQuery<CodedSequence> positionsQuery, long documentMetadata,
int features, int features,
int length, int length,
int bestCoherence, int bestCoherence,
@ -305,7 +305,7 @@ public class IndexResultScoreCalculator {
} }
public static double calculateAvgMinDistance(CompiledQuery<GammaCodedSequence> positions, ResultRankingContext ctx) { public static double calculateAvgMinDistance(CompiledQuery<CodedSequence> positions, ResultRankingContext ctx) {
double sum = 0; double sum = 0;
int cnt = 0; int cnt = 0;

View File

@ -4,7 +4,7 @@ import it.unimi.dsi.fastutil.ints.IntIterator;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint; import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.ids.TermIdList; import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations; import nu.marginalia.sequence.SequenceOperations;
import java.util.ArrayList; import java.util.ArrayList;
@ -28,7 +28,7 @@ public class TermCoherenceGroupList {
} }
} }
public boolean testMandatory(GammaCodedSequence[] positions) { public boolean testMandatory(CodedSequence[] positions) {
for (var coherenceSet : mandatoryGroups) { for (var coherenceSet : mandatoryGroups) {
if (!coherenceSet.test(positions)) { if (!coherenceSet.test(positions)) {
return false; return false;
@ -38,7 +38,7 @@ public class TermCoherenceGroupList {
return true; return true;
} }
public int testOptional(GammaCodedSequence[] positions) { public int testOptional(CodedSequence[] positions) {
int best = 0; int best = 0;
for (var coherenceSet : mandatoryGroups) { for (var coherenceSet : mandatoryGroups) {
if (coherenceSet.test(positions)) { if (coherenceSet.test(positions)) {
@ -71,7 +71,7 @@ public class TermCoherenceGroupList {
} }
} }
public boolean test(GammaCodedSequence[] positions) { public boolean test(CodedSequence[] positions) {
IntIterator[] sequences = new IntIterator[present.cardinality()]; IntIterator[] sequences = new IntIterator[present.cardinality()];
for (int oi = 0, si = 0; oi < offsets.length; oi++) { for (int oi = 0, si = 0; oi < offsets.length; oi++) {

View File

@ -5,7 +5,7 @@ import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
import nu.marginalia.index.positions.TermData; import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.TermMetadataList; import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.CodedSequence;
import javax.annotation.Nullable; import javax.annotation.Nullable;
@ -25,7 +25,7 @@ public class TermMetadataForCombinedDocumentIds {
} }
@Nullable @Nullable
public GammaCodedSequence getPositions(long termId, long combinedId) { public CodedSequence getPositions(long termId, long combinedId) {
var metaByCombinedId = termdocToMeta.get(termId); var metaByCombinedId = termdocToMeta.get(termId);
if (metaByCombinedId == null) { if (metaByCombinedId == null) {

View File

@ -1,7 +1,7 @@
package nu.marginalia.index.results.model.ids; package nu.marginalia.index.results.model.ids;
import nu.marginalia.index.positions.TermData; import nu.marginalia.index.positions.TermData;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.CodedSequence;
import javax.annotation.Nullable; import javax.annotation.Nullable;
import java.util.Arrays; import java.util.Arrays;
@ -28,7 +28,7 @@ public final class TermMetadataList {
* may be null if the term is not in the document * may be null if the term is not in the document
*/ */
@Nullable @Nullable
public GammaCodedSequence position(int i) { public CodedSequence position(int i) {
if (array[i] == null) if (array[i] == null)
return null; return null;

View File

@ -0,0 +1,23 @@
package nu.marginalia.sequence;
import blue.strategic.parquet.BinarySerializable;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import java.nio.ByteBuffer;
/** Abstraction over an encoded sequence of integers, introduced so that callers
 * depend on this interface rather than on the concrete GammaCodedSequence
 * implementation (see the rest of this commit, which swaps the usages over).
 */
public interface CodedSequence extends BinarySerializable {
/** The encoded representation of the sequence as a byte array. */
byte[] bytes();
/** Iterator over the decoded integer values of the sequence. */
IntIterator iterator();
/** Iterator over the decoded values with an offset applied — presumably added
 * to each value; TODO confirm against GammaCodedSequence's implementation. */
IntIterator offsetIterator(int offset);
/** The decoded values collected into an IntList. */
IntList values();
/** The buffer backing the encoded data. */
ByteBuffer buffer();
/** Size of the encoded data — presumably in bytes; verify against implementations. */
int bufferSize();
/** Number of values in the sequence. */
int valueCount();
}

View File

@ -1,6 +1,5 @@
package nu.marginalia.sequence; package nu.marginalia.sequence;
import blue.strategic.parquet.BinarySerializable;
import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntIterator; import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList; import it.unimi.dsi.fastutil.ints.IntList;
@ -16,7 +15,7 @@ import java.util.StringJoiner;
* and offers convenience methods for decoding and iterating * and offers convenience methods for decoding and iterating
* over the data. * over the data.
* */ * */
public class GammaCodedSequence implements BinarySerializable, Iterable<Integer> { public class GammaCodedSequence implements Iterable<Integer>, CodedSequence {
private final ByteBuffer raw; private final ByteBuffer raw;
private final int startPos; private final int startPos;

View File

@ -6,6 +6,7 @@ import blue.strategic.parquet.ValueWriter;
import gnu.trove.list.TLongList; import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList; import gnu.trove.list.array.TLongArrayList;
import lombok.*; import lombok.*;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.GammaCodedSequence;
import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Types; import org.apache.parquet.schema.Types;
@ -59,7 +60,7 @@ public class DocumentRecord {
@Nullable @Nullable
public TLongList metas; public TLongList metas;
@Nullable @Nullable
public List<GammaCodedSequence> positions; public List<CodedSequence> positions;
public static Hydrator<DocumentRecord, DocumentRecord> newHydrator() { public static Hydrator<DocumentRecord, DocumentRecord> newHydrator() {
return new DocumentDataHydrator(); return new DocumentDataHydrator();

View File

@ -4,6 +4,7 @@ import blue.strategic.parquet.Hydrator;
import gnu.trove.list.TLongList; import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList; import gnu.trove.list.array.TLongArrayList;
import lombok.*; import lombok.*;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.GammaCodedSequence;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
@ -30,7 +31,7 @@ public class DocumentRecordKeywordsProjection {
public List<String> words; public List<String> words;
public TLongList metas; public TLongList metas;
public List<GammaCodedSequence> positions; public List<CodedSequence> positions;
public boolean hasKeywords() { public boolean hasKeywords() {
return words != null && metas != null; return words != null && metas != null;

View File

@ -2,6 +2,7 @@ package nu.marginalia.io.processed;
import gnu.trove.list.array.TLongArrayList; import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.model.processed.DocumentRecord; import nu.marginalia.model.processed.DocumentRecord;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.GammaCodedSequence;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
@ -73,7 +74,7 @@ class DocumentRecordParquetFileReaderTest {
TLongArrayList metas = new TLongArrayList(LongStream.range(0, 100000).toArray()); TLongArrayList metas = new TLongArrayList(LongStream.range(0, 100000).toArray());
ByteBuffer workArea = ByteBuffer.allocate(1024); ByteBuffer workArea = ByteBuffer.allocate(1024);
List<GammaCodedSequence> poses = Stream.generate(() -> GammaCodedSequence.generate(workArea, 3, 4)).limit(100000).toList(); List<CodedSequence> poses = Stream.generate(() -> (CodedSequence) GammaCodedSequence.generate(workArea, 3, 4)).limit(100000).toList();
var doc = new DocumentRecord( var doc = new DocumentRecord(
"www.marginalia.nu", "www.marginalia.nu",

View File

@ -16,7 +16,7 @@ import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.processed.DocumentRecord; import nu.marginalia.model.processed.DocumentRecord;
import nu.marginalia.model.processed.DomainLinkRecord; import nu.marginalia.model.processed.DomainLinkRecord;
import nu.marginalia.model.processed.DomainRecord; import nu.marginalia.model.processed.DomainRecord;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.CodedSequence;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -130,7 +130,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
var wb = document.words.build(workArea); var wb = document.words.build(workArea);
List<String> words = Arrays.asList(wb.keywords); List<String> words = Arrays.asList(wb.keywords);
TLongArrayList metas = new TLongArrayList(wb.metadata); TLongArrayList metas = new TLongArrayList(wb.metadata);
List<GammaCodedSequence> positions = Arrays.asList(wb.positions); List<CodedSequence> positions = Arrays.asList(wb.positions);
documentWriter.write(new DocumentRecord( documentWriter.write(new DocumentRecord(
domainName, domainName,

View File

@ -10,7 +10,7 @@ import nu.marginalia.loading.domains.DomainIdRegistry;
import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.processed.DocumentRecordKeywordsProjection; import nu.marginalia.model.processed.DocumentRecordKeywordsProjection;
import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.CodedSequence;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -68,7 +68,7 @@ public class KeywordLoaderService {
var words = new DocumentKeywords( var words = new DocumentKeywords(
projection.words.toArray(String[]::new), projection.words.toArray(String[]::new),
projection.metas.toArray(), projection.metas.toArray(),
projection.positions.toArray(GammaCodedSequence[]::new) projection.positions.toArray(CodedSequence[]::new)
); );
writer.putWords(combinedId, writer.putWords(combinedId,