(coded-sequence) Replace GammaCodedSequence (GCS) usage with a CodedSequence interface

This commit is contained in:
Viktor Lofgren 2024-07-16 14:37:50 +02:00
parent 5c098005cc
commit 0b31c4cfbb
21 changed files with 68 additions and 39 deletions

View File

@ -1,6 +1,6 @@
package nu.marginalia.keyword.model;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.CodedSequence;
import java.io.Serial;
import java.io.Serializable;
@ -12,11 +12,11 @@ public final class DocumentKeywords implements Serializable {
public final String[] keywords;
public final long[] metadata;
public final GammaCodedSequence[] positions;
public final CodedSequence[] positions;
public DocumentKeywords(String[] keywords,
long[] metadata,
GammaCodedSequence[] positions)
CodedSequence[] positions)
{
this.keywords = keywords;
this.metadata = metadata;

View File

@ -6,6 +6,7 @@ import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import lombok.Getter;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -36,7 +37,7 @@ public class DocumentKeywordsBuilder {
public DocumentKeywords build(ByteBuffer workArea) {
final String[] wordArray = new String[wordToMeta.size()];
final long[] meta = new long[wordToMeta.size()];
final GammaCodedSequence[] positions = new GammaCodedSequence[wordToMeta.size()];
final CodedSequence[] positions = new CodedSequence[wordToMeta.size()];
var iter = wordToMeta.object2LongEntrySet().fastIterator();

View File

@ -5,6 +5,7 @@ import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jsoup.Jsoup;
@ -93,7 +94,7 @@ class DocumentKeywordExtractorTest {
var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024));
Map<String, WordMetadata> flags = new HashMap<>();
Map<String, GammaCodedSequence> positions = new HashMap<>();
Map<String, CodedSequence> positions = new HashMap<>();
for (int i = 0; i < keywordsBuilt.size(); i++) {
String keyword = keywordsBuilt.keywords[i];

View File

@ -9,6 +9,7 @@ import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.test.TestUtil;
import org.junit.jupiter.api.AfterEach;
@ -84,7 +85,7 @@ class ForwardIndexConverterTest {
new IndexJournalEntryData(
new String[]{},
new long[]{},
new GammaCodedSequence[]{}
new CodedSequence[]{}
)
);
}

View File

@ -1,11 +1,11 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.CodedSequence;
public record IndexJournalEntryData(long[] termIds,
long[] metadata,
GammaCodedSequence[] positions) {
CodedSequence[] positions) {
public IndexJournalEntryData {
assert termIds.length == metadata.length;
@ -14,7 +14,7 @@ public record IndexJournalEntryData(long[] termIds,
public IndexJournalEntryData(String[] keywords,
long[] metadata,
GammaCodedSequence[] positions)
CodedSequence[] positions)
{
this(termIds(keywords), metadata, positions);
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import java.nio.ByteBuffer;
@ -17,7 +18,7 @@ public record IndexJournalEntryTermData(
long metadata,
ByteBuffer positionsBuffer)
{
public GammaCodedSequence positions() {
public CodedSequence positions() {
return new GammaCodedSequence(positionsBuffer);
}

View File

@ -5,7 +5,7 @@ import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.CodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -81,7 +81,7 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
{
final long[] keywords = data.termIds();
final long[] metadata = data.metadata();
final GammaCodedSequence[] positions = data.positions();
final CodedSequence[] positions = data.positions();
int entrySize = 0;
for (var position : positions) {

View File

@ -1,7 +1,6 @@
package nu.marginalia.index.construction;
import nu.marginalia.index.positions.PositionCodec;
import nu.marginalia.sequence.GammaCodedSequence;
import java.io.IOException;
import java.nio.ByteBuffer;

View File

@ -1,5 +1,6 @@
package nu.marginalia.index.positions;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import java.nio.ByteBuffer;
@ -15,7 +16,7 @@ public class TermData {
return buffer.get(0);
}
public GammaCodedSequence positions() {
public CodedSequence positions() {
return new GammaCodedSequence(buffer, 1, buffer.capacity());
}
}

View File

@ -22,7 +22,7 @@ import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.CodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -75,7 +75,7 @@ public class IndexResultRankingService {
// thrashing in there; out here we can rely on implicit array ordering to match up the data.
long[] flags = new long[termCount];
GammaCodedSequence[] positions = new GammaCodedSequence[termCount];
CodedSequence[] positions = new CodedSequence[termCount];
// Iterate over documents by their index in the combinedDocIds, as we need the index for the
// term data arrays as well

View File

@ -16,7 +16,7 @@ import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations;
import javax.annotation.Nullable;
@ -50,10 +50,10 @@ public class IndexResultScoreCalculator {
public SearchResultItem calculateScore(long combinedId,
QuerySearchTerms searchTerms,
long[] wordFlags,
GammaCodedSequence[] positions)
CodedSequence[] positions)
{
CompiledQuery<GammaCodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
int[] counts = new int[compiledQuery.size()];
@ -116,7 +116,7 @@ public class IndexResultScoreCalculator {
return false;
}
private boolean hasPrioTerm(QuerySearchTerms searchTerms, GammaCodedSequence[] positions) {
private boolean hasPrioTerm(QuerySearchTerms searchTerms, CodedSequence[] positions) {
var allTerms = searchTerms.termIdsAll;
var prioTerms = searchTerms.termIdsPrio;
@ -166,7 +166,7 @@ public class IndexResultScoreCalculator {
public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
CompiledQueryInt positionsCountQuery,
CompiledQuery<GammaCodedSequence> positionsQuery, long documentMetadata,
CompiledQuery<CodedSequence> positionsQuery, long documentMetadata,
int features,
int length,
int bestCoherence,
@ -305,7 +305,7 @@ public class IndexResultScoreCalculator {
}
public static double calculateAvgMinDistance(CompiledQuery<GammaCodedSequence> positions, ResultRankingContext ctx) {
public static double calculateAvgMinDistance(CompiledQuery<CodedSequence> positions, ResultRankingContext ctx) {
double sum = 0;
int cnt = 0;

View File

@ -4,7 +4,7 @@ import it.unimi.dsi.fastutil.ints.IntIterator;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations;
import java.util.ArrayList;
@ -28,7 +28,7 @@ public class TermCoherenceGroupList {
}
}
public boolean testMandatory(GammaCodedSequence[] positions) {
public boolean testMandatory(CodedSequence[] positions) {
for (var coherenceSet : mandatoryGroups) {
if (!coherenceSet.test(positions)) {
return false;
@ -38,7 +38,7 @@ public class TermCoherenceGroupList {
return true;
}
public int testOptional(GammaCodedSequence[] positions) {
public int testOptional(CodedSequence[] positions) {
int best = 0;
for (var coherenceSet : mandatoryGroups) {
if (coherenceSet.test(positions)) {
@ -71,7 +71,7 @@ public class TermCoherenceGroupList {
}
}
public boolean test(GammaCodedSequence[] positions) {
public boolean test(CodedSequence[] positions) {
IntIterator[] sequences = new IntIterator[present.cardinality()];
for (int oi = 0, si = 0; oi < offsets.length; oi++) {

View File

@ -5,7 +5,7 @@ import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.CodedSequence;
import javax.annotation.Nullable;
@ -25,7 +25,7 @@ public class TermMetadataForCombinedDocumentIds {
}
@Nullable
public GammaCodedSequence getPositions(long termId, long combinedId) {
public CodedSequence getPositions(long termId, long combinedId) {
var metaByCombinedId = termdocToMeta.get(termId);
if (metaByCombinedId == null) {

View File

@ -1,7 +1,7 @@
package nu.marginalia.index.results.model.ids;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.CodedSequence;
import javax.annotation.Nullable;
import java.util.Arrays;
@ -28,7 +28,7 @@ public final class TermMetadataList {
* may be null if the term is not in the document
*/
@Nullable
public GammaCodedSequence position(int i) {
public CodedSequence position(int i) {
if (array[i] == null)
return null;

View File

@ -0,0 +1,23 @@
package nu.marginalia.sequence;
import blue.strategic.parquet.BinarySerializable;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import java.nio.ByteBuffer;
/**
 * Abstraction over an encoded (compressed) sequence of integers.
 *
 * <p>Introduced so that callers depend on this interface rather than on the
 * concrete {@link GammaCodedSequence} implementation directly; elsewhere in
 * this change, fields and signatures of type {@code GammaCodedSequence} are
 * migrated to {@code CodedSequence}. Throughout the codebase these sequences
 * carry keyword position data (see e.g. {@code DocumentKeywords.positions}).
 *
 * <p>Extends {@code BinarySerializable} so implementations can be written to
 * parquet files.
 */
public interface CodedSequence extends BinarySerializable {
/** The encoded representation of the sequence as a byte array. */
byte[] bytes();
/** Iterator over the decoded integer values of the sequence. */
IntIterator iterator();
/** Iterator over the decoded values; presumably each value is adjusted by {@code offset} — confirm against the implementation. */
IntIterator offsetIterator(int offset);
/** The decoded values as a list. */
IntList values();
/** A buffer view of the encoded data. */
ByteBuffer buffer();
/** Size of the encoded data in bytes — NOTE(review): assumed; verify against the implementation. */
int bufferSize();
/** Number of values in the sequence. */
int valueCount();
}

View File

@ -1,6 +1,5 @@
package nu.marginalia.sequence;
import blue.strategic.parquet.BinarySerializable;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
@ -16,7 +15,7 @@ import java.util.StringJoiner;
* and offers convenience methods for decoding and iterating
* over the data.
* */
public class GammaCodedSequence implements BinarySerializable, Iterable<Integer> {
public class GammaCodedSequence implements Iterable<Integer>, CodedSequence {
private final ByteBuffer raw;
private final int startPos;

View File

@ -6,6 +6,7 @@ import blue.strategic.parquet.ValueWriter;
import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import lombok.*;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Types;
@ -59,7 +60,7 @@ public class DocumentRecord {
@Nullable
public TLongList metas;
@Nullable
public List<GammaCodedSequence> positions;
public List<CodedSequence> positions;
public static Hydrator<DocumentRecord, DocumentRecord> newHydrator() {
return new DocumentDataHydrator();

View File

@ -4,6 +4,7 @@ import blue.strategic.parquet.Hydrator;
import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import lombok.*;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import org.jetbrains.annotations.NotNull;
@ -30,7 +31,7 @@ public class DocumentRecordKeywordsProjection {
public List<String> words;
public TLongList metas;
public List<GammaCodedSequence> positions;
public List<CodedSequence> positions;
public boolean hasKeywords() {
return words != null && metas != null;

View File

@ -2,6 +2,7 @@ package nu.marginalia.io.processed;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.model.processed.DocumentRecord;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
@ -73,7 +74,7 @@ class DocumentRecordParquetFileReaderTest {
TLongArrayList metas = new TLongArrayList(LongStream.range(0, 100000).toArray());
ByteBuffer workArea = ByteBuffer.allocate(1024);
List<GammaCodedSequence> poses = Stream.generate(() -> GammaCodedSequence.generate(workArea, 3, 4)).limit(100000).toList();
List<CodedSequence> poses = Stream.generate(() -> (CodedSequence) GammaCodedSequence.generate(workArea, 3, 4)).limit(100000).toList();
var doc = new DocumentRecord(
"www.marginalia.nu",

View File

@ -16,7 +16,7 @@ import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.processed.DocumentRecord;
import nu.marginalia.model.processed.DomainLinkRecord;
import nu.marginalia.model.processed.DomainRecord;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.CodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -130,7 +130,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
var wb = document.words.build(workArea);
List<String> words = Arrays.asList(wb.keywords);
TLongArrayList metas = new TLongArrayList(wb.metadata);
List<GammaCodedSequence> positions = Arrays.asList(wb.positions);
List<CodedSequence> positions = Arrays.asList(wb.positions);
documentWriter.write(new DocumentRecord(
domainName,

View File

@ -10,7 +10,7 @@ import nu.marginalia.loading.domains.DomainIdRegistry;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.processed.DocumentRecordKeywordsProjection;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.CodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -68,7 +68,7 @@ public class KeywordLoaderService {
var words = new DocumentKeywords(
projection.words.toArray(String[]::new),
projection.metas.toArray(),
projection.positions.toArray(GammaCodedSequence[]::new)
projection.positions.toArray(CodedSequence[]::new)
);
writer.putWords(combinedId,