mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(coded-sequence) Replace GCS usage with an interface
This commit is contained in:
parent
5c098005cc
commit
0b31c4cfbb
@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.keyword.model;
|
package nu.marginalia.keyword.model;
|
||||||
|
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
|
|
||||||
import java.io.Serial;
|
import java.io.Serial;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
@ -12,11 +12,11 @@ public final class DocumentKeywords implements Serializable {
|
|||||||
|
|
||||||
public final String[] keywords;
|
public final String[] keywords;
|
||||||
public final long[] metadata;
|
public final long[] metadata;
|
||||||
public final GammaCodedSequence[] positions;
|
public final CodedSequence[] positions;
|
||||||
|
|
||||||
public DocumentKeywords(String[] keywords,
|
public DocumentKeywords(String[] keywords,
|
||||||
long[] metadata,
|
long[] metadata,
|
||||||
GammaCodedSequence[] positions)
|
CodedSequence[] positions)
|
||||||
{
|
{
|
||||||
this.keywords = keywords;
|
this.keywords = keywords;
|
||||||
this.metadata = metadata;
|
this.metadata = metadata;
|
||||||
|
@ -6,6 +6,7 @@ import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
|
|||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
import nu.marginalia.model.idx.WordFlags;
|
import nu.marginalia.model.idx.WordFlags;
|
||||||
import nu.marginalia.model.idx.WordMetadata;
|
import nu.marginalia.model.idx.WordMetadata;
|
||||||
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@ -36,7 +37,7 @@ public class DocumentKeywordsBuilder {
|
|||||||
public DocumentKeywords build(ByteBuffer workArea) {
|
public DocumentKeywords build(ByteBuffer workArea) {
|
||||||
final String[] wordArray = new String[wordToMeta.size()];
|
final String[] wordArray = new String[wordToMeta.size()];
|
||||||
final long[] meta = new long[wordToMeta.size()];
|
final long[] meta = new long[wordToMeta.size()];
|
||||||
final GammaCodedSequence[] positions = new GammaCodedSequence[wordToMeta.size()];
|
final CodedSequence[] positions = new CodedSequence[wordToMeta.size()];
|
||||||
|
|
||||||
var iter = wordToMeta.object2LongEntrySet().fastIterator();
|
var iter = wordToMeta.object2LongEntrySet().fastIterator();
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@ import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
|||||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.idx.WordMetadata;
|
import nu.marginalia.model.idx.WordMetadata;
|
||||||
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
@ -93,7 +94,7 @@ class DocumentKeywordExtractorTest {
|
|||||||
var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024));
|
var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024));
|
||||||
|
|
||||||
Map<String, WordMetadata> flags = new HashMap<>();
|
Map<String, WordMetadata> flags = new HashMap<>();
|
||||||
Map<String, GammaCodedSequence> positions = new HashMap<>();
|
Map<String, CodedSequence> positions = new HashMap<>();
|
||||||
|
|
||||||
for (int i = 0; i < keywordsBuilt.size(); i++) {
|
for (int i = 0; i < keywordsBuilt.size(); i++) {
|
||||||
String keyword = keywordsBuilt.keywords[i];
|
String keyword = keywordsBuilt.keywords[i];
|
||||||
|
@ -9,6 +9,7 @@ import nu.marginalia.index.journal.writer.IndexJournalWriter;
|
|||||||
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
|
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
||||||
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
import nu.marginalia.test.TestUtil;
|
import nu.marginalia.test.TestUtil;
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
@ -84,7 +85,7 @@ class ForwardIndexConverterTest {
|
|||||||
new IndexJournalEntryData(
|
new IndexJournalEntryData(
|
||||||
new String[]{},
|
new String[]{},
|
||||||
new long[]{},
|
new long[]{},
|
||||||
new GammaCodedSequence[]{}
|
new CodedSequence[]{}
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -1,11 +1,11 @@
|
|||||||
package nu.marginalia.index.journal.model;
|
package nu.marginalia.index.journal.model;
|
||||||
|
|
||||||
import nu.marginalia.hash.MurmurHash3_128;
|
import nu.marginalia.hash.MurmurHash3_128;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
|
|
||||||
public record IndexJournalEntryData(long[] termIds,
|
public record IndexJournalEntryData(long[] termIds,
|
||||||
long[] metadata,
|
long[] metadata,
|
||||||
GammaCodedSequence[] positions) {
|
CodedSequence[] positions) {
|
||||||
|
|
||||||
public IndexJournalEntryData {
|
public IndexJournalEntryData {
|
||||||
assert termIds.length == metadata.length;
|
assert termIds.length == metadata.length;
|
||||||
@ -14,7 +14,7 @@ public record IndexJournalEntryData(long[] termIds,
|
|||||||
|
|
||||||
public IndexJournalEntryData(String[] keywords,
|
public IndexJournalEntryData(String[] keywords,
|
||||||
long[] metadata,
|
long[] metadata,
|
||||||
GammaCodedSequence[] positions)
|
CodedSequence[] positions)
|
||||||
{
|
{
|
||||||
this(termIds(keywords), metadata, positions);
|
this(termIds(keywords), metadata, positions);
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
package nu.marginalia.index.journal.model;
|
package nu.marginalia.index.journal.model;
|
||||||
|
|
||||||
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
|
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
@ -17,7 +18,7 @@ public record IndexJournalEntryTermData(
|
|||||||
long metadata,
|
long metadata,
|
||||||
ByteBuffer positionsBuffer)
|
ByteBuffer positionsBuffer)
|
||||||
{
|
{
|
||||||
public GammaCodedSequence positions() {
|
public CodedSequence positions() {
|
||||||
return new GammaCodedSequence(positionsBuffer);
|
return new GammaCodedSequence(positionsBuffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@ import lombok.SneakyThrows;
|
|||||||
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
||||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -81,7 +81,7 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
|
|||||||
{
|
{
|
||||||
final long[] keywords = data.termIds();
|
final long[] keywords = data.termIds();
|
||||||
final long[] metadata = data.metadata();
|
final long[] metadata = data.metadata();
|
||||||
final GammaCodedSequence[] positions = data.positions();
|
final CodedSequence[] positions = data.positions();
|
||||||
|
|
||||||
int entrySize = 0;
|
int entrySize = 0;
|
||||||
for (var position : positions) {
|
for (var position : positions) {
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
package nu.marginalia.index.construction;
|
package nu.marginalia.index.construction;
|
||||||
|
|
||||||
import nu.marginalia.index.positions.PositionCodec;
|
import nu.marginalia.index.positions.PositionCodec;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
package nu.marginalia.index.positions;
|
package nu.marginalia.index.positions;
|
||||||
|
|
||||||
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
|
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
@ -15,7 +16,7 @@ public class TermData {
|
|||||||
return buffer.get(0);
|
return buffer.get(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
public GammaCodedSequence positions() {
|
public CodedSequence positions() {
|
||||||
return new GammaCodedSequence(buffer, 1, buffer.capacity());
|
return new GammaCodedSequence(buffer, 1, buffer.capacity());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -22,7 +22,7 @@ import nu.marginalia.index.results.model.ids.TermIdList;
|
|||||||
import nu.marginalia.index.results.model.ids.TermMetadataList;
|
import nu.marginalia.index.results.model.ids.TermMetadataList;
|
||||||
import nu.marginalia.linkdb.docs.DocumentDbReader;
|
import nu.marginalia.linkdb.docs.DocumentDbReader;
|
||||||
import nu.marginalia.linkdb.model.DocdbUrlDetail;
|
import nu.marginalia.linkdb.model.DocdbUrlDetail;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -75,7 +75,7 @@ public class IndexResultRankingService {
|
|||||||
// thrashing in there; out here we can rely on implicit array ordering to match up the data.
|
// thrashing in there; out here we can rely on implicit array ordering to match up the data.
|
||||||
|
|
||||||
long[] flags = new long[termCount];
|
long[] flags = new long[termCount];
|
||||||
GammaCodedSequence[] positions = new GammaCodedSequence[termCount];
|
CodedSequence[] positions = new CodedSequence[termCount];
|
||||||
|
|
||||||
// Iterate over documents by their index in the combinedDocIds, as we need the index for the
|
// Iterate over documents by their index in the combinedDocIds, as we need the index for the
|
||||||
// term data arrays as well
|
// term data arrays as well
|
||||||
|
@ -16,7 +16,7 @@ import nu.marginalia.model.idx.DocumentFlags;
|
|||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
import nu.marginalia.model.idx.WordFlags;
|
import nu.marginalia.model.idx.WordFlags;
|
||||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import nu.marginalia.sequence.SequenceOperations;
|
import nu.marginalia.sequence.SequenceOperations;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
@ -50,10 +50,10 @@ public class IndexResultScoreCalculator {
|
|||||||
public SearchResultItem calculateScore(long combinedId,
|
public SearchResultItem calculateScore(long combinedId,
|
||||||
QuerySearchTerms searchTerms,
|
QuerySearchTerms searchTerms,
|
||||||
long[] wordFlags,
|
long[] wordFlags,
|
||||||
GammaCodedSequence[] positions)
|
CodedSequence[] positions)
|
||||||
{
|
{
|
||||||
|
|
||||||
CompiledQuery<GammaCodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
|
CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
|
||||||
|
|
||||||
int[] counts = new int[compiledQuery.size()];
|
int[] counts = new int[compiledQuery.size()];
|
||||||
|
|
||||||
@ -116,7 +116,7 @@ public class IndexResultScoreCalculator {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean hasPrioTerm(QuerySearchTerms searchTerms, GammaCodedSequence[] positions) {
|
private boolean hasPrioTerm(QuerySearchTerms searchTerms, CodedSequence[] positions) {
|
||||||
var allTerms = searchTerms.termIdsAll;
|
var allTerms = searchTerms.termIdsAll;
|
||||||
var prioTerms = searchTerms.termIdsPrio;
|
var prioTerms = searchTerms.termIdsPrio;
|
||||||
|
|
||||||
@ -166,7 +166,7 @@ public class IndexResultScoreCalculator {
|
|||||||
|
|
||||||
public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
|
public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
|
||||||
CompiledQueryInt positionsCountQuery,
|
CompiledQueryInt positionsCountQuery,
|
||||||
CompiledQuery<GammaCodedSequence> positionsQuery, long documentMetadata,
|
CompiledQuery<CodedSequence> positionsQuery, long documentMetadata,
|
||||||
int features,
|
int features,
|
||||||
int length,
|
int length,
|
||||||
int bestCoherence,
|
int bestCoherence,
|
||||||
@ -305,7 +305,7 @@ public class IndexResultScoreCalculator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static double calculateAvgMinDistance(CompiledQuery<GammaCodedSequence> positions, ResultRankingContext ctx) {
|
public static double calculateAvgMinDistance(CompiledQuery<CodedSequence> positions, ResultRankingContext ctx) {
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
int cnt = 0;
|
int cnt = 0;
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@ import it.unimi.dsi.fastutil.ints.IntIterator;
|
|||||||
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
|
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
|
||||||
import nu.marginalia.index.model.SearchTermsUtil;
|
import nu.marginalia.index.model.SearchTermsUtil;
|
||||||
import nu.marginalia.index.results.model.ids.TermIdList;
|
import nu.marginalia.index.results.model.ids.TermIdList;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import nu.marginalia.sequence.SequenceOperations;
|
import nu.marginalia.sequence.SequenceOperations;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -28,7 +28,7 @@ public class TermCoherenceGroupList {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean testMandatory(GammaCodedSequence[] positions) {
|
public boolean testMandatory(CodedSequence[] positions) {
|
||||||
for (var coherenceSet : mandatoryGroups) {
|
for (var coherenceSet : mandatoryGroups) {
|
||||||
if (!coherenceSet.test(positions)) {
|
if (!coherenceSet.test(positions)) {
|
||||||
return false;
|
return false;
|
||||||
@ -38,7 +38,7 @@ public class TermCoherenceGroupList {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int testOptional(GammaCodedSequence[] positions) {
|
public int testOptional(CodedSequence[] positions) {
|
||||||
int best = 0;
|
int best = 0;
|
||||||
for (var coherenceSet : mandatoryGroups) {
|
for (var coherenceSet : mandatoryGroups) {
|
||||||
if (coherenceSet.test(positions)) {
|
if (coherenceSet.test(positions)) {
|
||||||
@ -71,7 +71,7 @@ public class TermCoherenceGroupList {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean test(GammaCodedSequence[] positions) {
|
public boolean test(CodedSequence[] positions) {
|
||||||
IntIterator[] sequences = new IntIterator[present.cardinality()];
|
IntIterator[] sequences = new IntIterator[present.cardinality()];
|
||||||
|
|
||||||
for (int oi = 0, si = 0; oi < offsets.length; oi++) {
|
for (int oi = 0, si = 0; oi < offsets.length; oi++) {
|
||||||
|
@ -5,7 +5,7 @@ import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
|
|||||||
import nu.marginalia.index.positions.TermData;
|
import nu.marginalia.index.positions.TermData;
|
||||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||||
import nu.marginalia.index.results.model.ids.TermMetadataList;
|
import nu.marginalia.index.results.model.ids.TermMetadataList;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
|
|
||||||
@ -25,7 +25,7 @@ public class TermMetadataForCombinedDocumentIds {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Nullable
|
@Nullable
|
||||||
public GammaCodedSequence getPositions(long termId, long combinedId) {
|
public CodedSequence getPositions(long termId, long combinedId) {
|
||||||
var metaByCombinedId = termdocToMeta.get(termId);
|
var metaByCombinedId = termdocToMeta.get(termId);
|
||||||
|
|
||||||
if (metaByCombinedId == null) {
|
if (metaByCombinedId == null) {
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
package nu.marginalia.index.results.model.ids;
|
package nu.marginalia.index.results.model.ids;
|
||||||
|
|
||||||
import nu.marginalia.index.positions.TermData;
|
import nu.marginalia.index.positions.TermData;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
@ -28,7 +28,7 @@ public final class TermMetadataList {
|
|||||||
* may be null if the term is not in the document
|
* may be null if the term is not in the document
|
||||||
*/
|
*/
|
||||||
@Nullable
|
@Nullable
|
||||||
public GammaCodedSequence position(int i) {
|
public CodedSequence position(int i) {
|
||||||
if (array[i] == null)
|
if (array[i] == null)
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
|
@ -0,0 +1,23 @@
|
|||||||
|
package nu.marginalia.sequence;
|
||||||
|
|
||||||
|
import blue.strategic.parquet.BinarySerializable;
|
||||||
|
import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||||
|
import it.unimi.dsi.fastutil.ints.IntList;
|
||||||
|
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
|
||||||
|
public interface CodedSequence extends BinarySerializable {
|
||||||
|
byte[] bytes();
|
||||||
|
|
||||||
|
IntIterator iterator();
|
||||||
|
|
||||||
|
IntIterator offsetIterator(int offset);
|
||||||
|
|
||||||
|
IntList values();
|
||||||
|
|
||||||
|
ByteBuffer buffer();
|
||||||
|
|
||||||
|
int bufferSize();
|
||||||
|
|
||||||
|
int valueCount();
|
||||||
|
}
|
@ -1,6 +1,5 @@
|
|||||||
package nu.marginalia.sequence;
|
package nu.marginalia.sequence;
|
||||||
|
|
||||||
import blue.strategic.parquet.BinarySerializable;
|
|
||||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||||
import it.unimi.dsi.fastutil.ints.IntIterator;
|
import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||||
import it.unimi.dsi.fastutil.ints.IntList;
|
import it.unimi.dsi.fastutil.ints.IntList;
|
||||||
@ -16,7 +15,7 @@ import java.util.StringJoiner;
|
|||||||
* and offers convenience methods for decoding and iterating
|
* and offers convenience methods for decoding and iterating
|
||||||
* over the data.
|
* over the data.
|
||||||
* */
|
* */
|
||||||
public class GammaCodedSequence implements BinarySerializable, Iterable<Integer> {
|
public class GammaCodedSequence implements Iterable<Integer>, CodedSequence {
|
||||||
private final ByteBuffer raw;
|
private final ByteBuffer raw;
|
||||||
|
|
||||||
private final int startPos;
|
private final int startPos;
|
||||||
|
@ -6,6 +6,7 @@ import blue.strategic.parquet.ValueWriter;
|
|||||||
import gnu.trove.list.TLongList;
|
import gnu.trove.list.TLongList;
|
||||||
import gnu.trove.list.array.TLongArrayList;
|
import gnu.trove.list.array.TLongArrayList;
|
||||||
import lombok.*;
|
import lombok.*;
|
||||||
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
import org.apache.parquet.schema.MessageType;
|
import org.apache.parquet.schema.MessageType;
|
||||||
import org.apache.parquet.schema.Types;
|
import org.apache.parquet.schema.Types;
|
||||||
@ -59,7 +60,7 @@ public class DocumentRecord {
|
|||||||
@Nullable
|
@Nullable
|
||||||
public TLongList metas;
|
public TLongList metas;
|
||||||
@Nullable
|
@Nullable
|
||||||
public List<GammaCodedSequence> positions;
|
public List<CodedSequence> positions;
|
||||||
|
|
||||||
public static Hydrator<DocumentRecord, DocumentRecord> newHydrator() {
|
public static Hydrator<DocumentRecord, DocumentRecord> newHydrator() {
|
||||||
return new DocumentDataHydrator();
|
return new DocumentDataHydrator();
|
||||||
|
@ -4,6 +4,7 @@ import blue.strategic.parquet.Hydrator;
|
|||||||
import gnu.trove.list.TLongList;
|
import gnu.trove.list.TLongList;
|
||||||
import gnu.trove.list.array.TLongArrayList;
|
import gnu.trove.list.array.TLongArrayList;
|
||||||
import lombok.*;
|
import lombok.*;
|
||||||
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
@ -30,7 +31,7 @@ public class DocumentRecordKeywordsProjection {
|
|||||||
|
|
||||||
public List<String> words;
|
public List<String> words;
|
||||||
public TLongList metas;
|
public TLongList metas;
|
||||||
public List<GammaCodedSequence> positions;
|
public List<CodedSequence> positions;
|
||||||
|
|
||||||
public boolean hasKeywords() {
|
public boolean hasKeywords() {
|
||||||
return words != null && metas != null;
|
return words != null && metas != null;
|
||||||
|
@ -2,6 +2,7 @@ package nu.marginalia.io.processed;
|
|||||||
|
|
||||||
import gnu.trove.list.array.TLongArrayList;
|
import gnu.trove.list.array.TLongArrayList;
|
||||||
import nu.marginalia.model.processed.DocumentRecord;
|
import nu.marginalia.model.processed.DocumentRecord;
|
||||||
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
@ -73,7 +74,7 @@ class DocumentRecordParquetFileReaderTest {
|
|||||||
TLongArrayList metas = new TLongArrayList(LongStream.range(0, 100000).toArray());
|
TLongArrayList metas = new TLongArrayList(LongStream.range(0, 100000).toArray());
|
||||||
|
|
||||||
ByteBuffer workArea = ByteBuffer.allocate(1024);
|
ByteBuffer workArea = ByteBuffer.allocate(1024);
|
||||||
List<GammaCodedSequence> poses = Stream.generate(() -> GammaCodedSequence.generate(workArea, 3, 4)).limit(100000).toList();
|
List<CodedSequence> poses = Stream.generate(() -> (CodedSequence) GammaCodedSequence.generate(workArea, 3, 4)).limit(100000).toList();
|
||||||
|
|
||||||
var doc = new DocumentRecord(
|
var doc = new DocumentRecord(
|
||||||
"www.marginalia.nu",
|
"www.marginalia.nu",
|
||||||
|
@ -16,7 +16,7 @@ import nu.marginalia.model.crawl.HtmlFeature;
|
|||||||
import nu.marginalia.model.processed.DocumentRecord;
|
import nu.marginalia.model.processed.DocumentRecord;
|
||||||
import nu.marginalia.model.processed.DomainLinkRecord;
|
import nu.marginalia.model.processed.DomainLinkRecord;
|
||||||
import nu.marginalia.model.processed.DomainRecord;
|
import nu.marginalia.model.processed.DomainRecord;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -130,7 +130,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
|
|||||||
var wb = document.words.build(workArea);
|
var wb = document.words.build(workArea);
|
||||||
List<String> words = Arrays.asList(wb.keywords);
|
List<String> words = Arrays.asList(wb.keywords);
|
||||||
TLongArrayList metas = new TLongArrayList(wb.metadata);
|
TLongArrayList metas = new TLongArrayList(wb.metadata);
|
||||||
List<GammaCodedSequence> positions = Arrays.asList(wb.positions);
|
List<CodedSequence> positions = Arrays.asList(wb.positions);
|
||||||
|
|
||||||
documentWriter.write(new DocumentRecord(
|
documentWriter.write(new DocumentRecord(
|
||||||
domainName,
|
domainName,
|
||||||
|
@ -10,7 +10,7 @@ import nu.marginalia.loading.domains.DomainIdRegistry;
|
|||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
import nu.marginalia.model.processed.DocumentRecordKeywordsProjection;
|
import nu.marginalia.model.processed.DocumentRecordKeywordsProjection;
|
||||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -68,7 +68,7 @@ public class KeywordLoaderService {
|
|||||||
var words = new DocumentKeywords(
|
var words = new DocumentKeywords(
|
||||||
projection.words.toArray(String[]::new),
|
projection.words.toArray(String[]::new),
|
||||||
projection.metas.toArray(),
|
projection.metas.toArray(),
|
||||||
projection.positions.toArray(GammaCodedSequence[]::new)
|
projection.positions.toArray(CodedSequence[]::new)
|
||||||
);
|
);
|
||||||
|
|
||||||
writer.putWords(combinedId,
|
writer.putWords(combinedId,
|
||||||
|
Loading…
Reference in New Issue
Block a user