(wip) Extract and encode spans data

Refactoring keyword extraction to also extract span information.

Modifying the intermediate storage of converted data to use the new slop library, which allows for easier storage of ad-hoc binary data like spans and positions.

This is a bit of a Katamari Damacy commit that ended up dragging along a bunch of other, fairly tangentially related changes that are hard to break out into separate commits after the fact. Will push as-is to get back to doing more isolated work.
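For orientation, a minimal sketch of what the new span data looks like, using the CodedWordSpan and GammaCodedSequence types introduced in the diff below; the tag byte and positions are made-up example values, not taken from the commit:

import it.unimi.dsi.fastutil.ints.IntArrayList;
import nu.marginalia.model.idx.CodedWordSpan;
import nu.marginalia.sequence.GammaCodedSequence;
import java.nio.ByteBuffer;

ByteBuffer workArea = ByteBuffer.allocate(1024);

// Spans are flattened into interleaved (start, end) word positions per HTML tag type
IntArrayList headingPositions = new IntArrayList();
headingPositions.add(1);  headingPositions.add(4);   // words 1..3 sat inside a heading
headingPositions.add(17); headingPositions.add(22);  // words 17..21 sat inside another

// One byte identifies the tag ('h' is a placeholder code here);
// the position list is gamma coded into a compact binary sequence
CodedWordSpan headings = new CodedWordSpan((byte) 'h',
        GammaCodedSequence.generate(workArea, headingPositions));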
Viktor Lofgren 2024-07-27 11:44:13 +02:00
parent 52a9a0d410
commit aebb2652e8
221 changed files with 2584 additions and 4613 deletions

View File

@ -14,6 +14,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:braille-block-punch-cards')
implementation project(':code:libraries:coded-sequence')
implementation libs.bundles.slf4j

View File

@ -0,0 +1,32 @@
package nu.marginalia.model.idx;
import nu.marginalia.sequence.CodedSequence;
import java.util.List;
public record CodedWordSpan(byte code, CodedSequence spans) {
public static SplitSpansList fromSplit(String codes, List<CodedSequence> spans) {
return new SplitSpansList(codes, spans);
}
public static SplitSpansList split(List<CodedWordSpan> spanList) {
return new SplitSpansList(
spanList.stream()
.map(CodedWordSpan::code)
.collect(StringBuilder::new, StringBuilder::append, StringBuilder::append).toString(),
spanList.stream()
.map(CodedWordSpan::spans)
.toList()
);
}
public record SplitSpansList(String codes, List<CodedSequence> spans) {
public List<CodedWordSpan> unite() {
if (null == codes) {
return List.of();
}
else {
return codes.chars().mapToObj(c -> new CodedWordSpan((byte) c, spans.get(codes.indexOf(c)))).toList();
}
}
}
}

View File

@ -38,19 +38,27 @@ public enum WordFlags {
ExternalLink
;
public int asBit() {
return 1 << ordinal();
public byte asBit() {
return (byte) (1 << ordinal());
}
public boolean isPresent(long value) {
public boolean isPresent(byte value) {
return (asBit() & value) > 0;
}
public boolean isAbsent(long value) {
public boolean isAbsent(byte value) {
return (asBit() & value) == 0;
}
public static EnumSet<WordFlags> decode(long encodedValue) {
public static byte encode(EnumSet<WordFlags> flags) {
byte ret = 0;
for (WordFlags f : flags) {
ret |= f.asBit();
}
return ret;
}
public static EnumSet<WordFlags> decode(byte encodedValue) {
EnumSet<WordFlags> ret = EnumSet.noneOf(WordFlags.class);
for (WordFlags f : values()) {
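For reference, not part of the diff: with the flags now packed into a single byte rather than a long, a round trip through the new helpers looks roughly like this (the flag choice is arbitrary):

EnumSet<WordFlags> flags = EnumSet.of(WordFlags.Subjects, WordFlags.ExternalLink);

byte packed = WordFlags.encode(flags);          // one bit per flag, so the enum must keep at most 8 values
EnumSet<WordFlags> restored = WordFlags.decode(packed);

assert restored.equals(flags);
assert WordFlags.Subjects.isPresent(packed);    // bit tests now operate on the packed byte
assert WordFlags.Synthetic.isAbsent(packed);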

View File

@ -1,89 +0,0 @@
package nu.marginalia.model.idx;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import java.util.EnumSet;
import java.util.Set;
/** Word level metadata designed to fit in a single 64 bit long.
*
* @param positions bitmask of term positions within the document
* @param flags word flags (see {@link WordFlags})
*/
public record WordMetadata(long positions,
int flags) {
public static final long FLAGS_MASK = (1L << WordFlags.values().length) - 1;
public static final int POSITIONS_COUNT = 64 - WordFlags.values().length;
public static final int POSITIONS_SHIFT = WordFlags.values().length;
public static final long POSITIONS_MASK = ~0L >>> POSITIONS_SHIFT;
public WordMetadata() {
this(emptyValue());
}
public WordMetadata(long value) {
this(
((value >>> POSITIONS_SHIFT) & POSITIONS_MASK),
(int)(value & FLAGS_MASK)
);
}
public WordMetadata(long positions,
Set<WordFlags> flags)
{
this(positions, encodeFlags(flags));
}
private static int encodeFlags(Set<WordFlags> flags) {
int ret = 0;
for (var flag : flags) { ret |= flag.asBit(); }
return ret;
}
public static boolean hasFlags(long encoded, long metadataBitMask) {
return (encoded & metadataBitMask) == metadataBitMask;
}
public static boolean hasAnyFlags(long encoded, long metadataBitMask) {
return (encoded & metadataBitMask) != 0;
}
public static long decodePositions(long meta) {
return (meta >>> POSITIONS_SHIFT) & POSITIONS_MASK;
}
public boolean hasFlag(WordFlags flag) {
return (flags & flag.asBit()) != 0;
}
public String toString() {
return "[positions=%s; %s]".formatted(BrailleBlockPunchCards.printBits(positions, 56), flagSet());
}
/* Encoded in a 64 bit long
*/
public long encode() {
long ret = 0;
ret |= Integer.toUnsignedLong(flags) & FLAGS_MASK;
ret |= (positions & POSITIONS_MASK) << POSITIONS_SHIFT;
return ret;
}
public boolean isEmpty() {
return positions == 0 && flags == 0;
}
public static long emptyValue() {
return 0L;
}
public EnumSet<WordFlags> flagSet() {
return WordFlags.decode(flags);
}
}

View File

@ -1,41 +0,0 @@
package nu.marginalia.model;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import org.junit.jupiter.api.Test;
import java.util.EnumSet;
import static org.junit.jupiter.api.Assertions.assertEquals;
class WordMetadataTest {
@Test
public void codecTest() {
verifyCodec("Vanilla case", new WordMetadata(0x7f0f0000L, EnumSet.allOf(WordFlags.class)));
verifyCodec("Position 32bit", new WordMetadata(0xff0f0000L, EnumSet.allOf(WordFlags.class)));
verifyCodec("Position all", new WordMetadata(0xffff_ff0f_0000L, EnumSet.allOf(WordFlags.class)));
verifyCodec("No flags", new WordMetadata( 0xff0f0000L, EnumSet.noneOf(WordFlags.class)));
verifyCodec("No flags, some bits", new WordMetadata(0x3f_7f7f_7f7f_7f7fL, EnumSet.noneOf(WordFlags.class)));
verifyCodec("No flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.noneOf(WordFlags.class)));
verifyCodec("All flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.allOf(WordFlags.class)));
System.out.println(new WordMetadata(0x7f0f0005L, EnumSet.allOf(WordFlags.class)));
System.out.println(new WordMetadata(0xff0f0013L, EnumSet.noneOf(WordFlags.class)));
System.out.println(new WordMetadata(0xf0f000ff0f0013L, EnumSet.allOf(WordFlags.class)));
System.out.println(new WordMetadata(0xf0f000ff0f0013L, (byte)-1));
System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0));
System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0));
System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(~0L, (byte) 0).encode(), 64));
System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(0, (byte) 0xff).encode(), 64));
System.out.println(BrailleBlockPunchCards.printBits(131973L, 64));
System.out.println(new WordMetadata(131973L));
}
public void verifyCodec(String message, WordMetadata data) {
System.out.println(BrailleBlockPunchCards.printBits(data.encode(), 64));
assertEquals(data, new WordMetadata(data.encode()), message);
}
}

View File

@ -38,15 +38,15 @@ dependencies {
implementation project(':code:functions:search-query')
implementation project(':code:execution:api')
implementation project(':code:process-models:crawl-spec')
implementation project(':code:process-models:crawling-model')
implementation project(':code:processes:crawling-process:model')
implementation project(':code:processes:crawling-process:model')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-convert:data-extractors')
implementation project(':code:features-convert:stackexchange-xml')
implementation project(':code:features-convert:reddit-json')
implementation project(':code:index:index-journal')
implementation project(':code:index:api')
implementation project(':code:process-mqapi')
implementation project(':code:processes:process-mq-api')
implementation project(':third-party:encyclopedia-marginalia-nu')
implementation libs.bundles.slf4j

View File

@ -6,19 +6,11 @@ import com.google.inject.Singleton;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import lombok.With;
import nu.marginalia.IndexLocations;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorResumeBehavior;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.actor.state.Resume;
import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.process.ProcessOutboxes;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.svc.BackupService;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.index.api.IndexMqClient;
import nu.marginalia.index.api.IndexMqEndpoints;
import nu.marginalia.mq.MqMessageState;
@ -27,9 +19,20 @@ import nu.marginalia.mqapi.converting.ConvertRequest;
import nu.marginalia.mqapi.index.CreateIndexRequest;
import nu.marginalia.mqapi.index.IndexName;
import nu.marginalia.mqapi.loading.LoadRequest;
import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.process.ProcessOutboxes;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.svc.BackupService;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.file.Files;
import java.sql.SQLException;
import java.util.List;
@ -113,6 +116,21 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
yield new Load(List.of(processedId));
}
case Load(List<FileStorageId> processedIds, long msgId) when msgId < 0 -> {
// clear the loader's output directory of any debris from partial jobs that have been aborted
Files.list(IndexLocations.getIndexConstructionArea(storageService)).forEach(path -> {
try {
if (Files.isDirectory(path)) {
FileUtils.deleteDirectory(path.toFile());
}
else if (Files.isRegularFile(path)) {
Files.delete(path);
}
} catch (Exception e) {
logger.error("Error clearing staging area", e);
}
});
long id = mqLoaderOutbox.sendAsync(new LoadRequest(processedIds));
yield new Load(processedIds, id);

View File

@ -2,22 +2,25 @@ package nu.marginalia.svc;
import com.github.luben.zstd.ZstdInputStream;
import com.github.luben.zstd.ZstdOutputStream;
import com.google.inject.Inject;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.linkdb.LinkdbFileNames;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import com.google.inject.Inject;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.time.LocalDateTime;
import java.util.List;
import java.util.Optional;
public class BackupService {
@ -97,35 +100,20 @@ public class BackupService {
private void backupJournal(Path inputStorage, Path backupStorage) throws IOException
{
for (var source : IndexJournalFileNames.findJournalFiles(inputStorage)) {
var dest = backupStorage.resolve(source.toFile().getName());
try (var is = Files.newInputStream(source);
var os = Files.newOutputStream(dest)
) {
IOUtils.copyLarge(is, os);
}
Optional<IndexJournal> journal = IndexJournal.findJournal(inputStorage);
if (journal.isEmpty()) {
throw new FileNotFoundException("No journal found in input storage");
}
FileUtils.copyDirectory(journal.get().journalDir().toFile(), backupStorage.resolve(journal.get().journalDir().getFileName()).toFile());
}
private void restoreJournal(Path destStorage, Path backupStorage) throws IOException {
// Remove any old journal files first to avoid them getting loaded
for (var garbage : IndexJournalFileNames.findJournalFiles(destStorage)) {
Files.delete(garbage);
Optional<IndexJournal> journal = IndexJournal.findJournal(backupStorage);
if (journal.isEmpty()) {
throw new FileNotFoundException("No journal found in backup");
}
for (var source : IndexJournalFileNames.findJournalFiles(backupStorage)) {
var dest = destStorage.resolve(source.toFile().getName());
try (var is = Files.newInputStream(source);
var os = Files.newOutputStream(dest)
) {
IOUtils.copyLarge(is, os);
}
}
FileUtils.copyDirectory(backupStorage.resolve(journal.get().journalDir().getFileName()).toFile(), destStorage.toFile());
}
private void backupFileCompressed(String fileName, Path inputStorage, Path backupStorage) throws IOException

View File

@ -24,7 +24,7 @@ dependencies {
implementation project(':code:libraries:blocking-thread-pool')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-convert:anchor-keywords')
implementation project(':code:process-models:crawling-model')
implementation project(':code:processes:crawling-process:model')
implementation project(':code:processes:converting-process')
implementation project(':third-party:commons-codec')

View File

@ -3,13 +3,13 @@ package nu.marginalia.extractor;
import com.google.inject.Inject;
import gnu.trove.set.hash.TLongHashSet;
import lombok.SneakyThrows;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;

View File

@ -2,13 +2,13 @@ package nu.marginalia.extractor;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
import nu.marginalia.link_parser.FeedExtractor;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;

View File

@ -5,11 +5,11 @@ import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;

View File

@ -7,14 +7,16 @@ import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Stream;
public class DocumentKeywordExtractor {
private final KeywordExtractor keywordExtractor;
@ -93,7 +95,7 @@ public class DocumentKeywordExtractor {
var word = rep.word;
if (!word.isBlank()) {
long meta = metadata.getMetadataForWord(rep.stemmed);
byte meta = metadata.getMetadataForWord(rep.stemmed);
wordsBuilder.addMeta(word, meta);
}
}
@ -105,7 +107,13 @@ public class DocumentKeywordExtractor {
{
// we use 1-based indexing since the data
// will be gamma encoded, and it can't represent 0
int pos = 1;
int pos = 0;
List<SpanRecorder> spanRecorders = List.of(
new SpanRecorder(HtmlTag.TITLE),
new SpanRecorder(HtmlTag.HEADING),
new SpanRecorder(HtmlTag.CODE)
);
for (DocumentSentence sent : dld) {
@ -113,6 +121,12 @@ public class DocumentKeywordExtractor {
break;
for (var word : sent) {
pos++;
for (var recorder : spanRecorders) {
recorder.update(sent, pos);
}
if (word.isStopWord()) {
continue;
}
@ -120,7 +134,7 @@ public class DocumentKeywordExtractor {
String w = word.wordLowerCase();
if (matchesWordPattern(w)) {
/* Add information about term positions */
wordsBuilder.addPos(w, pos++);
wordsBuilder.addPos(w, pos);
/* Add metadata for word */
wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
@ -130,11 +144,16 @@ public class DocumentKeywordExtractor {
for (var names : keywordExtractor.getProperNames(sent)) {
var rep = new WordRep(sent, names);
long meta = metadata.getMetadataForWord(rep.stemmed);
byte meta = metadata.getMetadataForWord(rep.stemmed);
wordsBuilder.addMeta(rep.word, meta);
}
}
pos++; // we need to add one more position to account for the last word in the document
for (var recorder : spanRecorders) {
wordsBuilder.addSpans(recorder.finish(pos));
}
}
@ -176,4 +195,36 @@ public class DocumentKeywordExtractor {
return false;
}
/** Helper class to record spans of words */
private static class SpanRecorder {
private List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
private final HtmlTag htmlTag;
private int start = 0;
public SpanRecorder(HtmlTag htmlTag) {
this.htmlTag = htmlTag;
}
public void update(DocumentSentence sentence, int pos) {
assert pos > 0;
if (sentence.htmlTags.contains(htmlTag)) {
if (start <= 0) start = pos;
}
else {
if (start > 0) {
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
start = -1;
}
}
}
public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
if (start > 0) {
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
}
return spans;
}
}
}

View File

@ -27,9 +27,9 @@ class KeywordMetadata {
this.urlKeywords = urlKeywords;
}
public long getMetadataForWord(String stemmed) {
public byte getMetadataForWord(String stemmed) {
long flags = 0;
byte flags = 0;
if (subjectLikeKeywords.contains(stemmed)) {
flags |= WordFlags.Subjects.asBit();

View File

@ -1,36 +1,36 @@
package nu.marginalia.keyword.model;
import nu.marginalia.model.idx.CodedWordSpan;
import nu.marginalia.sequence.CodedSequence;
import java.io.Serial;
import java.io.Serializable;
import java.util.List;
public final class DocumentKeywords implements Serializable {
public final class DocumentKeywords {
@Serial
private static final long serialVersionUID = 1387282293082091432L;
public final List<String> keywords;
public final byte[] metadata;
public final List<CodedSequence> positions;
public final List<CodedWordSpan> spans;
public final String[] keywords;
public final long[] metadata;
public final CodedSequence[] positions;
public DocumentKeywords(String[] keywords,
long[] metadata,
CodedSequence[] positions)
public DocumentKeywords(List<String> keywords,
byte[] metadata,
List<CodedSequence> positions,
List<CodedWordSpan> spans)
{
this.keywords = keywords;
this.metadata = metadata;
this.positions = positions;
this.spans = spans;
assert keywords.length == metadata.length;
assert keywords.size() == metadata.length;
}
public boolean isEmpty() {
return keywords.length == 0;
return keywords.isEmpty();
}
public int size() {
return keywords.length;
return keywords.size();
}
}

View File

@ -1,11 +1,13 @@
package nu.marginalia.keyword.model;
import gnu.trove.list.array.TByteArrayList;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap;
import lombok.Getter;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.idx.CodedWordSpan;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import org.slf4j.Logger;
@ -16,8 +18,9 @@ import java.util.*;
@Getter
public class DocumentKeywordsBuilder {
public final Object2LongLinkedOpenHashMap<String> wordToMeta;
public final Object2ByteOpenHashMap<String> wordToMeta;
public final HashMap<String, IntList> wordToPos;
public final Map<Character, List<DocumentWordSpan>> wordSpans = new HashMap<>();
/** These were keywords that had signals of high relevance */
public final Set<String> importantWords = new HashSet<>();
@ -35,17 +38,17 @@ public class DocumentKeywordsBuilder {
}
public DocumentKeywords build(ByteBuffer workArea) {
final String[] wordArray = new String[wordToMeta.size()];
final long[] meta = new long[wordToMeta.size()];
final CodedSequence[] positions = new CodedSequence[wordToMeta.size()];
final List<String> wordArray = new ArrayList<>(wordToMeta.size());
final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
final List<CodedSequence> positions = new ArrayList<>(wordToMeta.size());
var iter = wordToMeta.object2LongEntrySet().fastIterator();
var iter = wordToMeta.object2ByteEntrySet().fastIterator();
for (int i = 0; iter.hasNext(); i++) {
while (iter.hasNext()) {
var entry = iter.next();
meta[i] = entry.getLongValue();
wordArray[i] = entry.getKey();
meta.add(entry.getByteValue());
wordArray.add(entry.getKey());
var posList = wordToPos.getOrDefault(entry.getKey(), IntList.of());
@ -53,18 +56,33 @@ public class DocumentKeywordsBuilder {
posList.subList(MAX_POSITIONS_PER_WORD, posList.size()).clear();
}
positions[i] = GammaCodedSequence.generate(workArea, posList);
positions.add(GammaCodedSequence.generate(workArea, posList));
}
return new DocumentKeywords(wordArray, meta, positions);
// Encode spans
List<CodedWordSpan> spans = new ArrayList<>(wordSpans.size());
wordSpans.forEach((tag, spansForTag) -> {
spansForTag.sort(Comparator.comparingInt(DocumentWordSpan::start));
var positionsForTag = new IntArrayList(spansForTag.size()*2);
for (var span : spansForTag) {
positionsForTag.add(span.start());
positionsForTag.add(span.end());
}
spans.add(new CodedWordSpan((byte) tag.charValue(), GammaCodedSequence.generate(workArea, positionsForTag)));
});
return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);
}
public DocumentKeywordsBuilder(int capacity) {
wordToMeta = new Object2LongLinkedOpenHashMap<>(capacity);
wordToMeta = new Object2ByteOpenHashMap<>(capacity);
wordToPos = new HashMap<>(capacity);
}
public void addMeta(String word, long meta) {
public void addMeta(String word, byte meta) {
if (word.length() > MAX_WORD_LENGTH)
return;
@ -84,12 +102,12 @@ public class DocumentKeywordsBuilder {
public void setFlagOnMetadataForWords(WordFlags flag, Collection<String> flagWords) {
flagWords.forEach(word ->
wordToMeta.mergeLong(word, flag.asBit(), (a, b) -> a|b)
wordToMeta.mergeByte(word, flag.asBit(), (a, b) -> (byte)(a|b))
);
}
public void addAllSyntheticTerms(Collection<String> newWords) {
long meta = WordFlags.Synthetic.asBit();
byte meta = WordFlags.Synthetic.asBit();
// Only add the synthetic flag if the words aren't already present
@ -97,17 +115,17 @@ public class DocumentKeywordsBuilder {
}
public void addAnchorTerms(Map<String, Integer> keywords) {
long flagA = WordFlags.ExternalLink.asBit();
long flagB = flagA | WordFlags.Site.asBit();
long flagC = flagB | WordFlags.SiteAdjacent.asBit();
byte flagA = WordFlags.ExternalLink.asBit();
byte flagB = (byte) (flagA | WordFlags.Site.asBit());
byte flagC = (byte) (flagB | WordFlags.SiteAdjacent.asBit());
keywords.forEach((word, count) -> {
if (count > 5) {
wordToMeta.mergeLong(word, flagC, (a, b) -> a|b);
wordToMeta.mergeByte(word, flagC, (a, b) -> (byte) (a|b));
} else if (count > 2) {
wordToMeta.mergeLong(word, flagB, (a, b) -> a|b);
wordToMeta.mergeByte(word, flagB, (a, b) -> (byte) (a|b));
} else {
wordToMeta.mergeLong(word, flagA, (a, b) -> a|b);
wordToMeta.mergeByte(word, flagA, (a, b) -> (byte) (a|b));
}
});
}
@ -115,9 +133,9 @@ public class DocumentKeywordsBuilder {
public List<String> getWordsWithAnyFlag(long flags) {
List<String> ret = new ArrayList<>();
for (var iter = wordToMeta.object2LongEntrySet().fastIterator(); iter.hasNext();) {
for (var iter = wordToMeta.object2ByteEntrySet().fastIterator(); iter.hasNext();) {
var entry = iter.next();
if ((flags & entry.getLongValue()) != 0) {
if ((flags & entry.getByteValue()) != 0) {
ret.add(entry.getKey());
}
}
@ -125,21 +143,27 @@ public class DocumentKeywordsBuilder {
return ret;
}
public void addSpans(List<DocumentWordSpan> newSpans) {
for (var span : newSpans) {
wordSpans.computeIfAbsent(span.tag().code, k -> new ArrayList<>()).add(span);
}
}
public int size() {
return Math.max(wordToMeta.size(), wordToPos.size());
}
public WordMetadata getMetaForWord(String word) {
return new WordMetadata(wordToMeta.getLong(word));
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("[ ");
wordToMeta.forEach((word, meta) -> {
sb.append(word).append("->").append(new WordMetadata(meta).flagSet()).append(',').append(wordToPos.getOrDefault(word, new IntArrayList())).append(' ');
sb.append(word).append("->").append(WordFlags.decode(meta)).append(',').append(wordToPos.getOrDefault(word, new IntArrayList())).append(' ');
});
return sb.append(']').toString();
}
public record DocumentWordSpan(HtmlTag tag, int start, int end) {
}
}

View File

@ -4,9 +4,8 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
@ -53,30 +52,11 @@ class DocumentKeywordExtractorTest {
keywords.getWordToMeta().forEach((k, v) -> {
if (k.contains("_")) {
System.out.println(k + " " + new WordMetadata(v));
System.out.println(k + " " + WordFlags.decode(v));
}
});
}
@Test
public void testKeyboards() throws IOException, URISyntaxException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0.5));
var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
System.out.println(keywords.getMetaForWord("mechanical"));
System.out.println(keywords.getMetaForWord("keyboard"));
System.out.println(keywords.getMetaForWord("keyboards"));
System.out.println(new WordMetadata(8894889328781L));
System.out.println(new WordMetadata(4294967297L));
System.out.println(new WordMetadata(566820053975498886L));
// -
System.out.println(new WordMetadata(1198298103937L));
System.out.println(new WordMetadata(1103808168065L));
}
@Test
public void testMadonna() throws IOException, URISyntaxException {
@ -93,16 +73,17 @@ class DocumentKeywordExtractorTest {
var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024));
Map<String, WordMetadata> flags = new HashMap<>();
Map<String, Byte> flags = new HashMap<>();
Map<String, CodedSequence> positions = new HashMap<>();
for (int i = 0; i < keywordsBuilt.size(); i++) {
String keyword = keywordsBuilt.keywords[i];
long metadata = keywordsBuilt.metadata[i];
String keyword = keywordsBuilt.keywords.get(i);
byte metadata = keywordsBuilt.metadata[i];
if (Set.of("dirty", "blues").contains(keyword)) {
flags.put(keyword, new WordMetadata(metadata));
positions.put(keyword, keywordsBuilt.positions[i]);
flags.put(keyword, metadata);
positions.put(keyword, keywordsBuilt.positions.get(i));
}
}
@ -127,7 +108,5 @@ class DocumentKeywordExtractorTest {
new TermFrequencyDict(WmsaHome.getLanguageModels()));
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online"));
System.out.println(keywords.getMetaForWord("knitting"));
}
}

View File

@ -1,6 +1,9 @@
package nu.marginalia.api.searchquery;
import lombok.SneakyThrows;
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
@ -11,9 +14,6 @@ import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import java.util.ArrayList;
@ -197,7 +197,8 @@ public class QueryProtobufCodec {
return new SearchResultKeywordScore(
keywordScores.getKeyword(),
-1, // termId is internal to index service
keywordScores.getEncodedWordMetadata()
(byte) keywordScores.getFlags(),
keywordScores.getPositions()
);
}

View File

@ -1,40 +1,32 @@
package nu.marginalia.api.searchquery.model.results;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import java.util.Objects;
public final class SearchResultKeywordScore {
public final long termId;
public final String keyword;
private final long encodedWordMetadata;
public byte flags;
public int positionCount;
public SearchResultKeywordScore(String keyword,
long termId,
long encodedWordMetadata) {
byte flags,
int positionCount) {
this.termId = termId;
this.keyword = keyword;
this.encodedWordMetadata = encodedWordMetadata;
}
public boolean hasTermFlag(WordFlags flag) {
return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
return (flags & flag.asBit()) != 0;
}
public long positions() {
return WordMetadata.decodePositions(encodedWordMetadata);
}
public boolean isKeywordSpecial() {
return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic);
}
public long encodedWordMetadata() {
return encodedWordMetadata;
}
@Override
public boolean equals(Object obj) {
if (obj == this) return true;
@ -51,8 +43,7 @@ public final class SearchResultKeywordScore {
@Override
public String toString() {
return "SearchResultKeywordScore[" +
"keyword=" + keyword + ", " +
"encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ']';
"keyword=" + keyword + ']';
}
}

View File

@ -108,7 +108,8 @@ message RpcRawResultItem {
/* Information about how well a keyword matches a query */
message RpcResultKeywordScore {
string keyword = 1; // the keyword
int64 encodedWordMetadata = 2; // bit encoded word metadata
int32 flags = 2;
int32 positions = 3;
}
/* Query execution parameters */

View File

@ -30,8 +30,9 @@ dependencies {
implementation project(':code:common:linkdb')
implementation project(':code:common:service')
implementation project(':code:functions:search-query:api')
implementation project(':code:processes:converting-process:model')
implementation project(':code:functions:search-query:api')
implementation project(':code:index:index-forward')
implementation project(':code:index:index-reverse')
implementation project(':code:index:query')
@ -73,4 +74,5 @@ dependencies {
testImplementation project(':code:libraries:test-helpers')
testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:libraries:braille-block-punch-cards')
testImplementation project(':code:libraries:test-helpers')
}

View File

@ -15,11 +15,13 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:slop')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
implementation project(':code:common:model')
implementation project(':code:common:process')
implementation project(':code:processes:converting-process:model')
implementation libs.bundles.slf4j
@ -28,6 +30,7 @@ dependencies {
implementation libs.fastutil
implementation libs.trove
testImplementation project(':code:libraries:test-helpers')
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito

View File

@ -1,19 +1,21 @@
package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.array.LongArray;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.slop.column.primitive.LongColumnReader;
import org.roaringbitmap.longlong.LongConsumer;
import org.roaringbitmap.longlong.Roaring64Bitmap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
@ -23,22 +25,25 @@ public class ForwardIndexConverter {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final IndexJournalReader journalReader;
private final Path outputFileDocsId;
private final Path outputFileDocsData;
private final DomainRankings domainRankings;
private final Path outputFileSpansData;
private final IndexJournal journal;
public ForwardIndexConverter(ProcessHeartbeat heartbeat,
IndexJournalReader journalReader,
Path outputFileDocsId,
Path outputFileDocsData,
Path outputFileSpansData,
IndexJournal journal,
DomainRankings domainRankings
) {
this.heartbeat = heartbeat;
this.journalReader = journalReader;
this.outputFileDocsId = outputFileDocsId;
this.outputFileDocsData = outputFileDocsData;
this.outputFileSpansData = outputFileSpansData;
this.journal = journal;
this.domainRankings = domainRankings;
}
@ -58,7 +63,7 @@ public class ForwardIndexConverter {
try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) {
progress.progress(TaskSteps.GET_DOC_IDS);
LongArray docsFileId = getDocIds(outputFileDocsId, journalReader);
LongArray docsFileId = getDocIds(outputFileDocsId, journal);
progress.progress(TaskSteps.GATHER_OFFSETS);
@ -73,20 +78,55 @@ public class ForwardIndexConverter {
LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
var pointer = journalReader.newPointer();
while (pointer.nextDocument()) {
long docId = pointer.documentId();
int domainId = UrlIdCodec.getDomainId(docId);
ByteBuffer workArea = ByteBuffer.allocate(65536);
for (var instance : journal.pages()) {
try (var docIdReader = instance.openCombinedId();
var metaReader = instance.openDocumentMeta();
var featuresReader = instance.openFeatures();
var sizeReader = instance.openSize();
long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(docId);
var spansCodesReader = instance.openSpanCodes();
var spansSeqReader = instance.openSpans();
var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData)
)
{
while (docIdReader.hasRemaining()) {
long docId = docIdReader.get();
int domainId = UrlIdCodec.getDomainId(docId);
int ranking = domainRankings.getRanking(domainId);
long meta = DocumentMetadata.encodeRank(pointer.documentMeta(), ranking);
long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(docId);
long features = pointer.documentFeatures() | ((long) pointer.documentSize() << 32L);
int ranking = domainRankings.getRanking(domainId);
long meta = DocumentMetadata.encodeRank(metaReader.get(), ranking);
docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, features);
final int docFeatures = featuresReader.get();
final int docSize = sizeReader.get();
long features = docFeatures | ((long) docSize << 32L);
// Write spans data
byte[] spansCodes = spansCodesReader.get();
spansWriter.beginRecord(spansCodes.length);
for (int i = 0; i < spansCodes.length; i++) {
workArea.clear();
spansSeqReader.getData(workArea);
workArea.flip();
spansWriter.writeSpan(spansCodes[i], workArea);
}
long encodedSpansOffset = spansWriter.endRecord();
// Write the principal forward documents file
docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, features);
docFileData.set(entryOffset + ForwardIndexParameters.SPANS_OFFSET, encodedSpansOffset);
}
}
}
progress.progress(TaskSteps.FORCE);
@ -104,9 +144,16 @@ public class ForwardIndexConverter {
}
}
private LongArray getDocIds(Path outputFileDocs, IndexJournalReader journalReader) throws IOException {
private LongArray getDocIds(Path outputFileDocs, IndexJournal journalReader) throws IOException {
Roaring64Bitmap rbm = new Roaring64Bitmap();
journalReader.forEachDocId(rbm::add);
for (var instance : journalReader.pages()) {
try (LongColumnReader idReader = instance.openCombinedId()) {
while (idReader.hasRemaining()) {
rbm.add(idReader.get());
}
}
}
LongArray ret = LongArrayFactory.mmapForWritingConfined(outputFileDocs, rbm.getIntCardinality());
rbm.forEach(new LongConsumer() {

View File

@ -13,6 +13,10 @@ public class ForwardIndexFileNames {
case NEXT -> basePath.resolve("fwd-doc-data.dat.next");
case CURRENT -> basePath.resolve("fwd-doc-data.dat");
};
case SPANS_DATA -> switch (version) {
case NEXT -> basePath.resolve("fwd-spans.dat.next");
case CURRENT -> basePath.resolve("fwd-spans.dat");
};
};
}
@ -23,6 +27,7 @@ public class ForwardIndexFileNames {
public enum FileIdentifier {
DOC_DATA,
SPANS_DATA,
DOC_ID
}
}

View File

@ -1,8 +1,8 @@
package nu.marginalia.index.forward;
class ForwardIndexParameters {
public static final int ENTRY_SIZE = 2;
public static final int ENTRY_SIZE = 3;
public static final int METADATA_OFFSET = 0;
public static final int FEATURES_OFFSET = 1;
public static final int SPANS_OFFSET = 2;
}

View File

@ -29,19 +29,31 @@ public class ForwardIndexReader {
private final TLongIntHashMap idToOffset;
private final LongArray data;
private final ForwardIndexSpansReader spansReader;
private final Logger logger = LoggerFactory.getLogger(getClass());
public ForwardIndexReader(Path idsFile, Path dataFile) throws IOException {
public ForwardIndexReader(Path idsFile,
Path dataFile,
Path spansFile) throws IOException {
if (!Files.exists(dataFile)) {
logger.warn("Failed to create ForwardIndexReader, {} is absent", dataFile);
idToOffset = null;
data = null;
spansReader = null;
return;
}
else if (!Files.exists(idsFile)) {
logger.warn("Failed to create ForwardIndexReader, {} is absent", idsFile);
idToOffset = null;
data = null;
spansReader = null;
return;
}
else if (!Files.exists(spansFile)) {
logger.warn("Failed to create ForwardIndexReader, {} is absent", spansFile);
idToOffset = null;
data = null;
spansReader = null;
return;
}
@ -49,6 +61,7 @@ public class ForwardIndexReader {
idToOffset = loadIds(idsFile);
data = loadData(dataFile);
spansReader = new ForwardIndexSpansReader(spansFile);
}
private static TLongIntHashMap loadIds(Path idsFile) throws IOException {

View File

@ -0,0 +1,63 @@
package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.GammaCodedSequence;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.List;
@SuppressWarnings("preview")
public class ForwardIndexSpansReader implements AutoCloseable {
private final FileChannel spansFileChannel;
public ForwardIndexSpansReader(Path spansFile) throws IOException {
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
}
public List<SpanData> readSpans(Arena arena, long encodedOffset) throws IOException {
long size = encodedOffset & 0xFFF_FFFF;
long offset = encodedOffset >>> 28;
var buffer = arena.allocate(size).asByteBuffer();
buffer.clear();
while (buffer.hasRemaining()) {
spansFileChannel.read(buffer, offset + buffer.position());
}
buffer.flip();
int count = buffer.get();
List<SpanData> ret = new ArrayList<>();
while (count-- > 0) {
byte code = buffer.get();
short len = buffer.getShort();
final int pos = buffer.position();
// Decode the gamma-coded sequence; this will advance the buffer position
// in a not entirely predictable way, so we need to save the position
buffer.limit(buffer.position() + len);
var sequence = new GammaCodedSequence(buffer).values();
ret.add(new SpanData(code, sequence));
// Reset the buffer position to the end of the span
buffer.position(pos + len);
buffer.limit(buffer.capacity());
}
return ret;
}
@Override
public void close() throws IOException {
spansFileChannel.close();
}
public record SpanData(byte code, IntList data) {}
}

View File

@ -0,0 +1,53 @@
package nu.marginalia.index.forward;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class ForwardIndexSpansWriter implements AutoCloseable {
private final FileChannel outputChannel;
private final ByteBuffer work = ByteBuffer.allocate(32);
private long stateStartOffset = -1;
private int stateLength = -1;
public ForwardIndexSpansWriter(Path outputFileSpansData) throws IOException {
this.outputChannel = (FileChannel) Files.newByteChannel(outputFileSpansData, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE);
}
public void beginRecord(int count) throws IOException {
stateStartOffset = outputChannel.position();
stateLength = 0;
work.clear();
work.put((byte) count);
work.flip();
while (work.hasRemaining())
stateLength += outputChannel.write(work);
}
public void writeSpan(byte spanCode, ByteBuffer sequenceData) throws IOException {
work.clear();
work.put(spanCode);
work.putShort((short) sequenceData.remaining());
work.flip();
while (work.hasRemaining() || sequenceData.hasRemaining()) {
stateLength += (int) outputChannel.write(new ByteBuffer[]{work, sequenceData});
}
}
public long endRecord() {
return stateStartOffset << 28 | stateLength;
}
@Override
public void close() throws IOException {
outputChannel.close();
}
}
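A note on the long returned by endRecord() above (illustrative, not part of the diff): the record's start offset and byte length are packed into a single value, which ForwardIndexSpansReader.readSpans() earlier in this commit splits apart again. This assumes a single record stays under 2^28 bytes.

long startOffset  = 4096;                   // example: where the record begins in the spans file
long recordLength = 123;                    // example: how many bytes the record occupies

// Packing, as in ForwardIndexSpansWriter.endRecord()
long encoded = startOffset << 28 | recordLength;

// Unpacking, as in ForwardIndexSpansReader.readSpans()
long size   = encoded & 0xFFF_FFFF;         // low 28 bits: record length
long offset = encoded >>> 28;               // high bits: file offset of the record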

View File

@ -2,15 +2,11 @@ package nu.marginalia.index.forward;
import lombok.SneakyThrows;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.test.TestUtil;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
@ -21,85 +17,94 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.IntStream;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
class ForwardIndexConverterTest {
IndexJournalWriter writer;
IndexJournalSlopWriter writer;
Path indexFile;
Path wordsFile1;
Path urlsFile1;
Path dictionaryFile;
Path workDir;
private final Logger logger = LoggerFactory.getLogger(getClass());
Path dataDir;
private Path docsFileId;
private Path docsFileData;
private Path docsSpanData;
int workSetSize = 512;
@BeforeEach
@SneakyThrows
void setUp() {
workDir = Files.createTempDirectory(getClass().getSimpleName());
dictionaryFile = Files.createTempFile("tmp", ".dict");
dictionaryFile.toFile().deleteOnExit();
indexFile = Files.createTempFile("tmp", ".idx");
indexFile.toFile().deleteOnExit();
writer = new IndexJournalWriterSingleFileImpl(indexFile);
wordsFile1 = Files.createTempFile("words1", ".idx");
urlsFile1 = Files.createTempFile("urls1", ".idx");
dataDir = Files.createTempDirectory(getClass().getSimpleName());
for (int i = 1; i < workSetSize; i++) {
createEntry(writer, i);
try (var writer = new IndexJournalSlopWriter(IndexJournal.allocateName(workDir), 0)) {
for (int i = 1; i < workSetSize; i++) {
createEntry(writer, i);
}
}
writer.close();
docsFileId = dataDir.resolve("docs-i.dat");
docsFileData = dataDir.resolve("docs-d.dat");
docsSpanData = dataDir.resolve("docs-s.dat");
}
@AfterEach
public void tearDown() {
TestUtil.clearTempDir(dataDir);
TestUtil.clearTempDir(workDir);
}
long createId(long url, long domain) {
return UrlIdCodec.encodeId((int) domain, (int) url);
}
public void createEntry(IndexJournalWriter writer, int id) {
public void createEntry(IndexJournalSlopWriter writer, int id) {
writer.put(
new IndexJournalEntryHeader(createId(id, id/20),
createId(id, id/20),
new SlopDocumentRecord.KeywordsProjection(
"",
-1,
id%3,
id%5,
15,
(id % 5)),
new IndexJournalEntryData(
new String[]{},
new long[]{},
new CodedSequence[]{}
List.of(),
new byte[0],
List.of(),
new byte[0],
List.of()
)
);
}
@Test
void testForwardIndex() throws IOException {
new ForwardIndexConverter(new FakeProcessHeartbeat(),
new IndexJournalReaderSingleFile(indexFile),
docsFileId,
docsFileData,
docsSpanData,
IndexJournal.findJournal(workDir).orElseThrow(),
new DomainRankings()).convert();
var forwardReader = new ForwardIndexReader(docsFileId, docsFileData);
var forwardReader = new ForwardIndexReader(docsFileId, docsFileData, docsSpanData);
for (int i = 36; i < workSetSize; i++) {
long docId = createId(i, i/20);
@ -108,5 +113,4 @@ class ForwardIndexConverterTest {
assertEquals(i/20, UrlIdCodec.getDomainId(docId));
}
}
}

View File

@ -0,0 +1,63 @@
package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.GammaCodedSequence;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.assertEquals;
class ForwardIndexSpansReaderTest {
Path testFile = Files.createTempFile("test", ".idx");
ForwardIndexSpansReaderTest() throws IOException {
}
@AfterEach
public void tearDown() throws IOException {
Files.deleteIfExists(testFile);
}
@Test
void testSunnyDay() throws IOException {
ByteBuffer wa = ByteBuffer.allocate(32);
long offset1;
long offset2;
try (var writer = new ForwardIndexSpansWriter(testFile)) {
writer.beginRecord(1);
writer.writeSpan((byte) 'a', GammaCodedSequence.generate(wa, 1, 3, 5).buffer());
offset1 = writer.endRecord();
writer.beginRecord(2);
writer.writeSpan((byte) 'b', GammaCodedSequence.generate(wa, 2, 4, 6).buffer());
writer.writeSpan((byte) 'c', GammaCodedSequence.generate(wa, 3, 5, 7).buffer());
offset2 = writer.endRecord();
}
try (var reader = new ForwardIndexSpansReader(testFile);
var arena = Arena.ofConfined()
) {
var spans1 = reader.readSpans(arena, offset1);
var spans2 = reader.readSpans(arena, offset2);
assertEquals(1, spans1.size());
assertEquals('a', spans1.get(0).code());
assertEquals(IntList.of(1, 3, 5), spans1.get(0).data());
assertEquals(2, spans2.size());
assertEquals('b', spans2.get(0).code());
assertEquals(IntList.of(2, 4, 6), spans2.get(0).data());
assertEquals('c', spans2.get(1).code());
assertEquals(IntList.of(3, 5, 7), spans2.get(1).data());
}
}
}

View File

@ -1,43 +0,0 @@
package nu.marginalia.test;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
public class TestUtil {
public static void clearTempDir(Path dir) {
if (Files.isDirectory(dir)) {
for (File f : dir.toFile().listFiles()) {
File[] files = f.listFiles();
if (files != null) {
Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir);
}
System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")");
f.delete();
}
}
System.out.println("Deleting " + dir);
dir.toFile().delete();
}
private static String fileSize(Path path) {
try {
long sizeBytes = Files.size(path);
if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb";
if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb";
if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb";
return sizeBytes + "b";
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
}
private static String round(double d) {
return String.format("%.2f", d);
}
}

View File

@ -15,7 +15,9 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:array')
implementation project(':code:libraries:slop')
implementation project(':code:common:model')
implementation project(':code:processes:converting-process:model')
implementation project(':third-party:parquet-floor')
implementation project(':third-party:commons-codec')

View File

@ -0,0 +1,53 @@
package nu.marginalia.index.journal;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
public record IndexJournal(Path journalDir) {
public static final String JOURNAL_FILE_NAME = "index-journal";
public static Path allocateName(Path base) {
return base.resolve(JOURNAL_FILE_NAME);
}
/** Returns the journal file in the base directory. */
public static Optional<IndexJournal> findJournal(Path baseDirectory) {
Path journal = baseDirectory.resolve(JOURNAL_FILE_NAME);
if (Files.isDirectory(journal)) {
return Optional.of(new IndexJournal(journal));
}
return Optional.empty();
}
/** Returns the number of versions of the journal file in the base directory. */
public static int numPages(Path baseDirectory) {
for (int version = 0; ; version++) {
if (!IndexJournalPage.combinedId.forPage(version).exists(baseDirectory)) {
return version;
}
}
}
public IndexJournal {
if (!journalDir.toFile().isDirectory()) {
throw new IllegalArgumentException("Invalid journal directory: " + journalDir);
}
}
public List<IndexJournalPage> pages() {
int pages = numPages(journalDir);
List<IndexJournalPage> instances = new ArrayList<>(pages);
for (int version = 0; version < pages; version++) {
instances.add(new IndexJournalPage(journalDir, version));
}
return instances;
}
}

View File

@ -1,30 +0,0 @@
package nu.marginalia.index.journal;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
public class IndexJournalFileNames {
public static Path allocateName(Path base, int idx) {
return base.resolve(String.format("page-index-%04d.dat", idx));
}
public static List<Path> findJournalFiles(Path baseDirectory) throws IOException {
List<Path> ret = new ArrayList<>();
try (var listStream = Files.list(baseDirectory)) {
listStream
.filter(IndexJournalFileNames::isJournalFile)
.sorted()
.forEach(ret::add);
}
return ret;
}
public static boolean isJournalFile(Path file) {
return file.toFile().getName().matches("page-index-\\d{4}.dat");
}
}

View File

@ -0,0 +1,76 @@
package nu.marginalia.index.journal;
import nu.marginalia.slop.column.array.ByteArrayColumnReader;
import nu.marginalia.slop.column.array.ByteArrayColumnWriter;
import nu.marginalia.slop.column.dynamic.GammaCodedSequenceReader;
import nu.marginalia.slop.column.dynamic.GammaCodedSequenceWriter;
import nu.marginalia.slop.column.dynamic.VarintColumnReader;
import nu.marginalia.slop.column.dynamic.VarintColumnWriter;
import nu.marginalia.slop.column.primitive.*;
import nu.marginalia.slop.desc.ColumnDesc;
import nu.marginalia.slop.desc.ColumnType;
import nu.marginalia.slop.desc.StorageType;
import java.io.IOException;
import java.nio.file.Path;
public record IndexJournalPage(Path baseDir, int page) {
public static final ColumnDesc<IntColumnReader, IntColumnWriter> features = new ColumnDesc<>("features", ColumnType.INT_LE, StorageType.PLAIN);
public static final ColumnDesc<IntColumnReader, IntColumnWriter> size = new ColumnDesc<>("size", ColumnType.INT_LE, StorageType.PLAIN);
public static final ColumnDesc<LongColumnReader, LongColumnWriter> combinedId = new ColumnDesc<>("combinedId", ColumnType.LONG_LE, StorageType.PLAIN);
public static final ColumnDesc<LongColumnReader, LongColumnWriter> documentMeta = new ColumnDesc<>("documentMeta", ColumnType.LONG_LE, StorageType.PLAIN);
public static final ColumnDesc<VarintColumnReader, VarintColumnWriter> termCounts = new ColumnDesc<>("termCounts", ColumnType.VARINT_LE, StorageType.PLAIN);
public static final ColumnDesc<LongColumnReader, LongColumnWriter> termIds = new ColumnDesc<>("termIds", ColumnType.LONG_LE, StorageType.ZSTD);
public static final ColumnDesc<ByteColumnReader, ByteColumnWriter> termMeta = new ColumnDesc<>("termMetadata", ColumnType.BYTE, StorageType.ZSTD);
public static final ColumnDesc<GammaCodedSequenceReader, GammaCodedSequenceWriter> positions = new ColumnDesc<>("termPositions", ColumnType.BYTE_ARRAY_GCS, StorageType.ZSTD);
public static final ColumnDesc<ByteArrayColumnReader, ByteArrayColumnWriter> spanCodes = new ColumnDesc<>("spanCodes", ColumnType.BYTE_ARRAY, StorageType.ZSTD);
public static final ColumnDesc<GammaCodedSequenceReader, GammaCodedSequenceWriter> spans = new ColumnDesc<>("spans", ColumnType.BYTE_ARRAY_GCS, StorageType.ZSTD);
public IndexJournalPage {
if (!baseDir.toFile().isDirectory()) {
throw new IllegalArgumentException("Invalid base directory: " + baseDir);
}
}
public LongColumnReader openCombinedId() throws IOException {
return combinedId.forPage(page).open(baseDir);
}
public LongColumnReader openDocumentMeta() throws IOException {
return documentMeta.forPage(page).open(baseDir);
}
public IntColumnReader openFeatures() throws IOException {
return features.forPage(page).open(baseDir);
}
public IntColumnReader openSize() throws IOException {
return size.forPage(page).open(baseDir);
}
public LongColumnReader openTermCounts() throws IOException {
return termCounts.forPage(page).open(baseDir);
}
public LongColumnReader openTermIds() throws IOException {
return termIds.forPage(page).open(baseDir);
}
public ByteColumnReader openTermMetadata() throws IOException {
return termMeta.forPage(page).open(baseDir);
}
public GammaCodedSequenceReader openTermPositions() throws IOException {
return positions.forPage(page).open(baseDir);
}
public GammaCodedSequenceReader openSpans() throws IOException {
return spans.forPage(page).open(baseDir);
}
public ByteArrayColumnReader openSpanCodes() throws IOException {
return spanCodes.forPage(page).open(baseDir);
}
}
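A short usage sketch, not part of the diff: the column descriptors above are consumed page by page, one value per document per column, mirroring the reading loop in ForwardIndexConverter earlier in this commit (indexDir is a placeholder path):

IndexJournal journal = IndexJournal.findJournal(indexDir).orElseThrow();

for (IndexJournalPage page : journal.pages()) {
    try (LongColumnReader ids  = page.openCombinedId();
         LongColumnReader meta = page.openDocumentMeta()) {
        while (ids.hasRemaining()) {
            long combinedId   = ids.get();
            long documentMeta = meta.get();
            // columns are read in lockstep; features, size, and spans follow the same pattern
        }
    }
}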

View File

@ -0,0 +1,105 @@
package nu.marginalia.index.journal;
import lombok.SneakyThrows;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.slop.column.array.ByteArrayColumnWriter;
import nu.marginalia.slop.column.dynamic.GammaCodedSequenceWriter;
import nu.marginalia.slop.column.primitive.ByteColumnWriter;
import nu.marginalia.slop.column.primitive.IntColumnWriter;
import nu.marginalia.slop.column.primitive.LongColumnWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
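/** Writes an index journal page in the slop column layout described by
 * IndexJournalPage, adding one record per document via put().
 */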
public class IndexJournalSlopWriter implements AutoCloseable {
private final IntColumnWriter featuresWriter;
private final IntColumnWriter sizeWriter;
private final LongColumnWriter combinedIdWriter;
private final LongColumnWriter documentMetaWriter;
private final LongColumnWriter termCountsWriter;
private final LongColumnWriter termIdsWriter;
private final ByteColumnWriter termMetadataWriter;
private final GammaCodedSequenceWriter termPositionsWriter;
private final GammaCodedSequenceWriter spansWriter;
private final ByteArrayColumnWriter spanCodesWriter;
private static final MurmurHash3_128 hash = new MurmurHash3_128();
public IndexJournalSlopWriter(Path dir, int page) throws IOException {
if (!Files.exists(dir)) {
Files.createDirectory(dir);
}
featuresWriter = IndexJournalPage.features.forPage(page).create(dir);
sizeWriter = IndexJournalPage.size.forPage(page).create(dir);
combinedIdWriter = IndexJournalPage.combinedId.forPage(page).create(dir);
documentMetaWriter = IndexJournalPage.documentMeta.forPage(page).create(dir);
termCountsWriter = IndexJournalPage.termCounts.forPage(page).create(dir);
termIdsWriter = IndexJournalPage.termIds.forPage(page).create(dir);
termMetadataWriter = IndexJournalPage.termMeta.forPage(page).create(dir);
termPositionsWriter = IndexJournalPage.positions.forPage(page).create(dir);
spansWriter = IndexJournalPage.spans.forPage(page).create(dir);
spanCodesWriter = IndexJournalPage.spanCodes.forPage(page).create(dir);
}
@SneakyThrows
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection) {
combinedIdWriter.put(combinedId);
featuresWriter.put(keywordsProjection.htmlFeatures());
sizeWriter.put(keywordsProjection.length());
documentMetaWriter.put(keywordsProjection.documentMetadata());
// -- write keyword data --
final List<String> keywords = keywordsProjection.words();
byte[] termMetadata = keywordsProjection.metas();
termCountsWriter.put(keywords.size());
// termIds are 64-bit MurmurHash3 hashes of the keywords
long[] termIds = new long[keywordsProjection.words().size()];
for (int i = 0; i < termIds.length; i++) {
termIds[i] = hash.hashKeyword(keywords.get(i));
}
List<CodedSequence> termPositions = keywordsProjection.positions();
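// write the per-term columns in lockstep: one metadata byte, term id, and positions sequence per keyword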
for (int i = 0; i < termMetadata.length; i++) {
termMetadataWriter.put(termMetadata[i]);
termIdsWriter.put(termIds[i]);
termPositionsWriter.put((GammaCodedSequence) termPositions.get(i));
}
// -- write spans --
spanCodesWriter.put(keywordsProjection.spanCodes());
for (var span : keywordsProjection.spans()) {
spansWriter.put((GammaCodedSequence) span);
}
}
public void close() throws IOException {
featuresWriter.close();
sizeWriter.close();
combinedIdWriter.close();
documentMetaWriter.close();
termCountsWriter.close();
termIdsWriter.close();
termMetadataWriter.close();
termPositionsWriter.close();
spansWriter.close();
spanCodesWriter.close();
}
}
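
For orientation, a minimal writing sketch (illustrative; an output directory and a pre-built KeywordsProjection for the document are assumed):

try (var writer = new IndexJournalSlopWriter(outputDir, 0)) {
    // one put() call per document; combinedId is the UrlIdCodec-encoded document id
    writer.put(combinedId, keywordsProjection);
}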

View File

@ -1,36 +0,0 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.sequence.CodedSequence;
public record IndexJournalEntryData(long[] termIds,
long[] metadata,
CodedSequence[] positions) {
public IndexJournalEntryData {
assert termIds.length == metadata.length;
assert termIds.length == positions.length;
}
public IndexJournalEntryData(String[] keywords,
long[] metadata,
CodedSequence[] positions)
{
this(termIds(keywords), metadata, positions);
}
private static final MurmurHash3_128 hash = new MurmurHash3_128();
public int size() {
return termIds.length;
}
private static long[] termIds(String[] keywords) {
long[] termIds = new long[keywords.length];
for (int i = 0; i < keywords.length; i++) {
termIds[i] = hash.hashKeyword(keywords[i]);
}
return termIds;
}
}

View File

@ -1,35 +0,0 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
/** The header of an index journal entry.
*
* @param entrySize the size of the entry's term data section, in bytes
* @param documentFeatures the features of the document, as an encoded HtmlFeature
* @param documentSize the size of the document
* @param combinedId the combined document id, encoded with UrlIdCodec
* @param documentMeta the metadata of the document, as an encoded DocumentMetadata
*
* @see DocumentMetadata
* @see HtmlFeature
* @see UrlIdCodec
*/
public record IndexJournalEntryHeader(int entrySize,
int documentFeatures,
int documentSize,
long combinedId,
long documentMeta) {
public IndexJournalEntryHeader(long combinedId,
int documentFeatures,
int documentSize,
long documentMeta) {
this(-1,
documentFeatures,
documentSize,
combinedId,
documentMeta);
}
}

View File

@ -1,25 +0,0 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import java.nio.ByteBuffer;
/** Data corresponding to a term in a document in the index journal.
*
* @param termId the id of the term
* @param metadata the metadata of the term
* @param positionsBuffer buffer holding positions of the word in the document, gamma coded
*
* @see GammaCodedSequence
*/
public record IndexJournalEntryTermData(
long termId,
long metadata,
ByteBuffer positionsBuffer)
{
public CodedSequence positions() {
return new GammaCodedSequence(positionsBuffer);
}
}

View File

@ -1,10 +0,0 @@
package nu.marginalia.index.journal.model;
/** The header of an index journal file. This is the first 16 bytes of the file,
* and is not compressed.
*
* @param fileSizeRecords the size of the file in number of records
* @param reserved should be 0
*/
public record IndexJournalFileHeader(long fileSizeRecords, long reserved) {
}

View File

@ -1,111 +0,0 @@
package nu.marginalia.index.journal.reader;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import nu.marginalia.model.id.UrlIdCodec;
import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Iterator;
public class IndexJournalReadEntry implements Iterable<IndexJournalEntryTermData> {
public final IndexJournalEntryHeader header;
private final ByteBuffer buffer;
private final int initialPos;
public IndexJournalReadEntry(IndexJournalEntryHeader header, ByteBuffer buffer) {
this.header = header;
this.buffer = buffer;
this.initialPos = buffer.position();
}
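/** Read the next record from the stream.  The on-disk layout is a 24 byte document
 * header (2 byte entry size, 2 byte document size, 4 byte features, 8 byte combined id,
 * 8 byte document meta) followed by entrySize bytes of term data.
 */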
public static IndexJournalReadEntry read(DataInputStream inputStream) throws IOException {
final int entrySize = (inputStream.readShort() & 0xFFFF);
final int docSize = inputStream.readShort();
final int docFeatures = inputStream.readInt();
final long docId = inputStream.readLong();
final long meta = inputStream.readLong();
var header = new IndexJournalEntryHeader(
entrySize,
docFeatures,
docSize,
docId,
meta);
byte[] buffer = new byte[entrySize];
inputStream.readFully(buffer);
return new IndexJournalReadEntry(header, ByteBuffer.wrap(buffer));
}
public long docId() {
return header.combinedId();
}
public long docMeta() {
return header.documentMeta();
}
public int documentFeatures() {
return header.documentFeatures();
}
public int documentSize() {
return header.documentSize();
}
public int domainId() {
return UrlIdCodec.getDomainId(docId());
}
public void reset() {
buffer.position(initialPos);
}
public Iterator<IndexJournalEntryTermData> iterator() {
return new TermDataIterator(buffer, initialPos);
}
}
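/** Iterates over the term records of a single document entry.  Each record is a
 * 12 byte header (8 byte term id, 2 byte metadata, 2 byte positions length) followed
 * by the gamma coded positions data.
 */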
class TermDataIterator implements Iterator<IndexJournalEntryTermData> {
private final ByteBuffer buffer;
// Pointer alias to buffer, used to reduce slice() allocation overhead in the iterator
private final ByteBuffer alias;
TermDataIterator(ByteBuffer buffer, int initialPos) {
this.buffer = buffer;
this.buffer.position(initialPos);
this.alias = buffer.duplicate();
}
@Override
public boolean hasNext() {
return buffer.position() < buffer.limit();
}
@Override
public IndexJournalEntryTermData next() {
// read the metadata for the term
long termId = buffer.getLong();
long meta = buffer.getShort();
// read the size of the sequence data
int size = buffer.getShort() & 0xFFFF;
// position the alias buffer to the term data
alias.limit(buffer.position() + size);
alias.position(buffer.position());
// advance the buffer position to the next term
buffer.position(buffer.position() + size);
return new IndexJournalEntryTermData(termId, meta, alias);
}
}

View File

@ -1,73 +0,0 @@
package nu.marginalia.index.journal.reader;
import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer;
import java.io.IOException;
import java.nio.file.Path;
import java.util.function.LongConsumer;
import java.util.function.LongPredicate;
/** Tools for reading the index journal. */
public interface IndexJournalReader {
int FILE_HEADER_SIZE_LONGS = 2;
int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS;
int DOCUMENT_HEADER_SIZE_BYTES = 24;
int TERM_HEADER_SIZE_BYTES = 12;
/** Create a reader for a single file. */
static IndexJournalReader singleFile(Path fileName) throws IOException {
return new IndexJournalReaderSingleFile(fileName);
}
/** Create a reader for a set of files. */
static IndexJournalReader paging(Path baseDir) throws IOException {
return new IndexJournalReaderPagingImpl(baseDir);
}
default void forEachWordId(LongConsumer consumer) {
var ptr = this.newPointer();
while (ptr.nextDocument()) {
for (var termData : ptr) {
consumer.accept(termData.termId());
}
}
}
default void forEachDocId(LongConsumer consumer) throws IOException {
try (var ptr = this.newPointer()) {
while (ptr.nextDocument()) {
consumer.accept(ptr.documentId());
}
}
}
/** Create a new pointer to the journal. The IndexJournalPointer is
* a two-tiered iterator that allows both iteration over document records
* and the terms within each document.
*/
IndexJournalPointer newPointer();
/** Reader that filters the entries based on the term metadata. */
default IndexJournalReader filtering(LongPredicate termMetaFilter) {
return new FilteringIndexJournalReader(this, termMetaFilter);
}
}
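/** Reader decorator that filters the term records visible through its pointers
 * based on a predicate over the term metadata. */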
class FilteringIndexJournalReader implements IndexJournalReader {
private final IndexJournalReader base;
private final LongPredicate termMetaFilter;
FilteringIndexJournalReader(IndexJournalReader base, LongPredicate termMetaFilter) {
this.base = base;
this.termMetaFilter = termMetaFilter;
}
@Override
public IndexJournalPointer newPointer() {
return base
.newPointer()
.filterWordMeta(termMetaFilter);
}
}

View File

@ -1,43 +0,0 @@
package nu.marginalia.index.journal.reader;
import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
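/** IndexJournalReader that treats a set of journal files as a single logical
 * journal, by concatenating the pointers of the individual file readers. */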
public class IndexJournalReaderPagingImpl implements IndexJournalReader {
private static final Logger logger = LoggerFactory.getLogger(IndexJournalReaderPagingImpl.class);
private final List<IndexJournalReader> readers;
public IndexJournalReaderPagingImpl(Path baseDir) throws IOException {
this(IndexJournalFileNames.findJournalFiles(baseDir));
if (readers.isEmpty())
logger.warn("Creating paging index journal reader in {}, found no inputs!", baseDir);
else
logger.info("Creating paging index journal reader for {} inputs", readers.size());
}
public IndexJournalReaderPagingImpl(List<Path> inputFiles) throws IOException {
this.readers = new ArrayList<>(inputFiles.size());
for (var inputFile : inputFiles) {
readers.add(new IndexJournalReaderSingleFile(inputFile));
}
}
@Override
public IndexJournalPointer newPointer() {
return IndexJournalPointer.concatenate(
readers.stream()
.map(IndexJournalReader::newPointer)
.toArray(IndexJournalPointer[]::new)
);
}
}

View File

@ -1,116 +0,0 @@
package nu.marginalia.index.journal.reader;
import com.github.luben.zstd.ZstdInputStream;
import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import nu.marginalia.index.journal.model.IndexJournalFileHeader;
import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer;
import org.jetbrains.annotations.NotNull;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Iterator;
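/** IndexJournalReader for a single zstd compressed journal file. */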
public class IndexJournalReaderSingleFile implements IndexJournalReader {
private final Path journalFile;
public final IndexJournalFileHeader fileHeader;
@Override
public String toString() {
return "IndexJournalReaderSingleFile{" + journalFile + " }";
}
public IndexJournalReaderSingleFile(Path file) throws IOException {
this.journalFile = file;
fileHeader = readHeader(file);
}
private static IndexJournalFileHeader readHeader(Path file) throws IOException {
try (var raf = new RandomAccessFile(file.toFile(), "r")) {
long recordCount = raf.readLong();
long unused = raf.readLong();
return new IndexJournalFileHeader(recordCount, unused);
}
}
private static DataInputStream createInputStream(Path file) throws IOException {
var fileInputStream = Files.newInputStream(file, StandardOpenOption.READ);
// skip the header
fileInputStream.skipNBytes(16);
return new DataInputStream(new ZstdInputStream(new BufferedInputStream(fileInputStream)));
}
@SneakyThrows
@Override
public IndexJournalPointer newPointer() {
return new SingleFileJournalPointer(fileHeader, createInputStream(journalFile));
}
}
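/** Pointer over the record stream of a single journal file. */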
class SingleFileJournalPointer implements IndexJournalPointer {
private final IndexJournalFileHeader fileHeader;
private final DataInputStream dataInputStream;
private IndexJournalReadEntry entry;
private int docIdx = -1;
public SingleFileJournalPointer(
IndexJournalFileHeader fileHeader,
DataInputStream dataInputStream)
{
this.fileHeader = fileHeader;
this.dataInputStream = dataInputStream;
}
@SneakyThrows
@Override
public boolean nextDocument() {
if (++docIdx < fileHeader.fileSizeRecords()) {
entry = IndexJournalReadEntry.read(dataInputStream);
return true;
}
dataInputStream.close();
return false;
}
@Override
public long documentId() {
return entry.docId();
}
@Override
public long documentMeta() {
return entry.docMeta();
}
@Override
public int documentFeatures() { return entry.documentFeatures(); }
@Override
public int documentSize() { return entry.documentSize(); }
/** Return an iterator over the terms in the current document.
* This iterator is not valid after calling nextDocument().
*/
@NotNull
@Override
public Iterator<IndexJournalEntryTermData> iterator() {
return entry.iterator();
}
@Override
public void close() throws IOException {
dataInputStream.close();
}
}

View File

@ -1,202 +0,0 @@
package nu.marginalia.index.journal.reader.pointer;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.util.Iterator;
import java.util.function.LongPredicate;
/**
* This is something like a double iterator. The Index Journal consists of
* blocks of words and word metadata for each document, along with the
* document's own metadata.
* <br>
*
* Perhaps best conceptualized as something like
*
* <pre>[doc1: word1 word2 word3 word4] [doc2: word1 word2 word3 ]</pre>
* nextDocument() will move the pointer from doc1 to doc2;<br>
* iterating over the terms via iterator() will move from word1 to word2...<br>
*/
public interface IndexJournalPointer extends Iterable<IndexJournalEntryTermData>, AutoCloseable {
/**
* Advance to the next document in the journal,
* returning true if such a document exists.
* Resets the record index to before the first
* record (if it exists).
*/
boolean nextDocument();
/**
* Get the id associated with the current document
*/
long documentId();
/**
* Get the metadata associated with the current document
*/
long documentMeta();
/**
* Get the documentFeatures associated with the current record
*/
int documentFeatures();
int documentSize();
/** Concatenate a number of journal pointers */
static IndexJournalPointer concatenate(IndexJournalPointer... pointers) {
if (pointers.length == 1)
return pointers[0];
return new JoiningJournalPointer(pointers);
}
/** Add a filter on word metadata to the pointer */
default IndexJournalPointer filterWordMeta(LongPredicate filter) {
return new FilteringJournalPointer(this, filter);
}
void close() throws IOException;
}
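
/* A minimal usage sketch of the pointer interface (illustrative; assumes an
 * IndexJournalReader `reader` has been created for some journal):
 *
 *   try (var ptr = reader.newPointer()) {
 *       while (ptr.nextDocument()) {
 *           long docId = ptr.documentId();
 *           for (var termData : ptr) {
 *               long termId = termData.termId();
 *               long termMeta = termData.metadata();
 *           }
 *       }
 *   }
 */

/** Pointer that concatenates several pointers, advancing to the next one
 * when the current pointer is exhausted. */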
class JoiningJournalPointer implements IndexJournalPointer {
private final IndexJournalPointer[] pointers;
private int pIndex = 0;
JoiningJournalPointer(IndexJournalPointer[] pointers) {
this.pointers = pointers;
}
@Override
public boolean nextDocument() {
while (pIndex < pointers.length) {
if (pointers[pIndex].nextDocument())
return true;
else pIndex++;
}
return false;
}
@Override
public long documentId() {
return pointers[pIndex].documentId();
}
@Override
public long documentMeta() {
return pointers[pIndex].documentMeta();
}
@Override
public int documentFeatures() {
return pointers[pIndex].documentFeatures();
}
@Override
public int documentSize() {
return pointers[pIndex].documentSize();
}
@NotNull
@Override
public Iterator<IndexJournalEntryTermData> iterator() {
return pointers[pIndex].iterator();
}
public void close() {
for (var p : pointers) {
try {
p.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
}
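/** Pointer decorator that only exposes term records whose metadata matches the
 * given predicate, and skips documents that have no matching terms. */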
class FilteringJournalPointer implements IndexJournalPointer {
private final IndexJournalPointer base;
private final LongPredicate filter;
FilteringJournalPointer(IndexJournalPointer base, LongPredicate filter) {
this.base = base;
this.filter = filter;
}
@Override
public boolean nextDocument() {
while (base.nextDocument()) {
if (iterator().hasNext()) {
return true;
}
}
return false;
}
@Override
public long documentId() {
return base.documentId();
}
@Override
public long documentMeta() {
return base.documentMeta();
}
@Override
public int documentFeatures() {
return base.documentFeatures();
}
@Override
public int documentSize() {
return base.documentSize();
}
@NotNull
@Override
public Iterator<IndexJournalEntryTermData> iterator() {
return new Iterator<>() {
private final Iterator<IndexJournalEntryTermData> baseIter = base.iterator();
private IndexJournalEntryTermData value = null;
@Override
public boolean hasNext() {
if (value != null) {
return true;
}
while (baseIter.hasNext()) {
value = baseIter.next();
if (filter.test(value.metadata())) {
return true;
}
}
value = null;
return false;
}
@Override
public IndexJournalEntryTermData next() {
if (hasNext()) {
var ret = value;
value = null;
return ret;
} else {
throw new IllegalStateException("No more elements");
}
}
};
}
@Override
public void close() throws IOException {
base.close();
}
}

View File

@ -1,17 +0,0 @@
package nu.marginalia.index.journal.writer;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import java.io.IOException;
/** Responsible for writing to the index journal.
* <p></p>
* @see IndexJournalWriterSingleFileImpl
* @see IndexJournalWriterPagingImpl
*/
public interface IndexJournalWriter extends AutoCloseable {
void close() throws IOException;
int put(IndexJournalEntryHeader header, IndexJournalEntryData data);
}

View File

@ -1,68 +0,0 @@
package nu.marginalia.index.journal.writer;
import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
/** IndexJournalWriter implementation that creates a sequence of journal files,
* delegating to IndexJournalWriterSingleFileImpl to write the individual files.
*
*/
public class IndexJournalWriterPagingImpl implements IndexJournalWriter {
private final Path outputDir;
private int fileNumber = 0;
/** The maximum size of a journal file, in uncompressed bytes.
* This should be safely below 2 GB, since we assume in the construction
* of the index that this is the case! The smaller these files are, the
* slower the index construction will be, but at the same time, if 2 GB
* is exceeded, the index construction will *quietly* fail.
*
* Flap flap, Icarus!
*/
private static final long sizeLimitBytes = 1_000_000_000; // 1 GB
private final Logger logger = LoggerFactory.getLogger(getClass());
private IndexJournalWriter currentWriter = null;
private long bytesWritten = 0;
public IndexJournalWriterPagingImpl(Path outputDir) throws IOException {
this.outputDir = outputDir;
switchToNextWriter();
logger.info("Creating Journal Writer {}", outputDir);
}
private void switchToNextWriter() throws IOException {
if (currentWriter != null)
currentWriter.close();
currentWriter = new IndexJournalWriterSingleFileImpl(IndexJournalFileNames.allocateName(outputDir, fileNumber++));
}
@Override
@SneakyThrows
public int put(IndexJournalEntryHeader header, IndexJournalEntryData data)
{
if (bytesWritten >= sizeLimitBytes) {
bytesWritten = 0;
switchToNextWriter();
}
int writtenNow = currentWriter.put(header, data);
bytesWritten += writtenNow;
return writtenNow;
}
public void close() throws IOException {
currentWriter.close();
}
}

View File

@ -1,155 +0,0 @@
package nu.marginalia.index.journal.writer;
import com.github.luben.zstd.ZstdDirectBufferCompressingStream;
import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.sequence.CodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions;
/** IndexJournalWriter implementation that creates a single journal file */
public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
private static final int ZSTD_BUFFER_SIZE = 1<<16;
private static final int DATA_BUFFER_SIZE = 1<<16;
private final ByteBuffer dataBuffer = ByteBuffer.allocateDirect(DATA_BUFFER_SIZE);
private final ZstdDirectBufferCompressingStream compressingStream;
private final FileChannel fileChannel;
private int numEntries = 0;
private boolean closed = false;
private final Logger logger = LoggerFactory.getLogger(getClass());
public IndexJournalWriterSingleFileImpl(Path outputFile) throws IOException {
logger.info("Creating Journal Writer {}", outputFile);
Files.deleteIfExists(outputFile);
Files.createFile(outputFile, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
fileChannel = FileChannel.open(outputFile, StandardOpenOption.CREATE,
StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING);
writeHeaderPlaceholder(fileChannel);
compressingStream = new ZstdDirectBufferCompressingStream(ByteBuffer.allocateDirect(ZSTD_BUFFER_SIZE), 3) {
protected ByteBuffer flushBuffer(ByteBuffer toFlush) throws IOException {
toFlush.flip();
while (toFlush.hasRemaining()) {
fileChannel.write(toFlush);
}
toFlush.clear();
return toFlush;
}
};
}
/** The file has a non-compressed header at the beginning of the file.
* Write a placeholder first to reserve the bytes, and position the
* channel after the header
*/
private static void writeHeaderPlaceholder(FileChannel fileStream) throws IOException {
var buffer = ByteBuffer.allocate(IndexJournalReader.FILE_HEADER_SIZE_BYTES);
buffer.position(0);
buffer.limit(buffer.capacity());
while (buffer.hasRemaining())
fileStream.write(buffer, buffer.position());
fileStream.position(IndexJournalReader.FILE_HEADER_SIZE_BYTES);
}
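/** Write a single document record to the journal.  The layout mirrors what
 * IndexJournalReadEntry expects: a document header (entry size, document size,
 * features, combined id, document meta) followed by one term record per keyword
 * (term id, metadata, positions length, gamma coded positions data).
 */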
@Override
@SneakyThrows
public int put(IndexJournalEntryHeader header,
IndexJournalEntryData data)
{
final long[] keywords = data.termIds();
final long[] metadata = data.metadata();
final CodedSequence[] positions = data.positions();
int entrySize = 0;
for (var position : positions) {
entrySize += IndexJournalReader.TERM_HEADER_SIZE_BYTES + position.bufferSize();
}
int totalSize = IndexJournalReader.DOCUMENT_HEADER_SIZE_BYTES + entrySize;
if (entrySize > DATA_BUFFER_SIZE) {
// This should never happen, but if it does, we should log it and deal with it in a way that doesn't corrupt the file
// (64 KB is *a lot* of data for a single document, larger than the uncompressed HTML in like the 95%th percentile of web pages)
logger.error("Omitting entry: Record size {} exceeds maximum representable size of {}", entrySize, DATA_BUFFER_SIZE);
return 0;
}
if (dataBuffer.remaining() < totalSize) {
dataBuffer.flip();
compressingStream.compress(dataBuffer);
dataBuffer.clear();
}
if (dataBuffer.remaining() < totalSize) {
logger.error("Omitting entry: Record size {} exceeds buffer size of {}", totalSize, dataBuffer.capacity());
return 0;
}
assert entrySize < (1 << 16) : "Entry size must not exceed USHORT_MAX";
dataBuffer.putShort((short) entrySize);
dataBuffer.putShort((short) Math.clamp(header.documentSize(), 0, Short.MAX_VALUE));
dataBuffer.putInt(header.documentFeatures());
dataBuffer.putLong(header.combinedId());
dataBuffer.putLong(header.documentMeta());
for (int i = 0; i < keywords.length; i++) {
dataBuffer.putLong(keywords[i]);
dataBuffer.putShort((short) metadata[i]);
dataBuffer.putShort((short) positions[i].bufferSize());
dataBuffer.put(positions[i].buffer());
}
numEntries++;
return totalSize;
}
public void close() throws IOException {
if (closed)
return;
else
closed = true;
dataBuffer.flip();
compressingStream.compress(dataBuffer);
dataBuffer.clear();
compressingStream.flush();
compressingStream.close();
// Finalize the file by writing a header in the beginning
ByteBuffer header = ByteBuffer.allocate(IndexJournalReader.FILE_HEADER_SIZE_BYTES);
header.putLong(numEntries);
header.putLong(0); // reserved for future use
header.flip();
while (header.position() < header.limit()) {
fileChannel.write(header, header.position());
}
fileChannel.close();
}
}

View File

@ -1,448 +0,0 @@
package nu.marginalia.index.journal;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import nu.marginalia.index.journal.reader.IndexJournalReaderPagingImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.sequence.GammaCodedSequence;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static org.junit.jupiter.api.Assertions.*;
public class IndexJournalWriterTest {
Path tempFile;
Path tempFile2;
ByteBuffer workArea = ByteBuffer.allocate(1024);
@BeforeEach
public void setUp() throws IOException {
tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
tempFile2 = Files.createTempFile(getClass().getSimpleName(), ".dat");
}
@AfterEach
public void tearDown() throws IOException {
Files.delete(tempFile);
Files.delete(tempFile2);
}
private GammaCodedSequence gcs(int... values) {
return GammaCodedSequence.generate(workArea, values);
}
static MurmurHash3_128 hasher = new MurmurHash3_128();
static long wordId(String str) {
return hasher.hashKeyword(str);
}
@Test
public void testSingleFile() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
new GammaCodedSequence[]{
gcs(1, 3, 5),
gcs(2, 4, 6),
})
);
writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
new GammaCodedSequence[]{
gcs(2, 4, 6),
gcs(3, 5, 7),
})
);
}
catch (IOException ex) {
Assertions.fail(ex);
}
// Read the journal back
try {
var reader = new IndexJournalReaderSingleFile(tempFile);
Iterator<IndexJournalEntryTermData> iter;
IndexJournalEntryTermData termData;
try (var ptr = reader.newPointer()) {
/** DOCUMENT 1 */
assertTrue(ptr.nextDocument());
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(33, ptr.documentMeta());
assertEquals(10, ptr.documentSize());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(44, termData.metadata());
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
// Term 2
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word2"), termData.termId());
assertEquals(55, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
/** DOCUMENT 2 */
assertTrue(ptr.nextDocument());
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
assertEquals(11, ptr.documentSize());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(45, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// Term 2
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word2"), termData.termId());
assertEquals(56, termData.metadata());
assertEquals(IntList.of(3, 5, 7), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
// No more documents
assertFalse(ptr.nextDocument());
}
}
catch (IOException ex) {
Assertions.fail(ex);
}
}
@Test
public void testMultiFile() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
new GammaCodedSequence[]{
gcs(1, 3, 5),
gcs(2, 4, 6),
})
);
}
catch (IOException ex) {
Assertions.fail(ex);
}
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile2)) {
writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
new GammaCodedSequence[]{
gcs(2, 4, 6),
gcs(3, 5, 7),
})
);
}
catch (IOException ex) {
Assertions.fail(ex);
}
// Read the journal back
try {
var reader = new IndexJournalReaderPagingImpl(List.of(tempFile, tempFile2));
Iterator<IndexJournalEntryTermData> iter;
IndexJournalEntryTermData termData;
try (var ptr = reader.newPointer()) {
/** DOCUMENT 1 */
assertTrue(ptr.nextDocument());
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(33, ptr.documentMeta());
assertEquals(10, ptr.documentSize());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(44, termData.metadata());
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
// Term 2
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word2"), termData.termId());
assertEquals(55, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
/** DOCUMENT 2 */
assertTrue(ptr.nextDocument());
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
assertEquals(11, ptr.documentSize());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(45, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// Term 2
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word2"), termData.termId());
assertEquals(56, termData.metadata());
assertEquals(IntList.of(3, 5, 7), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
// No more documents
assertFalse(ptr.nextDocument());
}
}
catch (IOException ex) {
Assertions.fail(ex);
}
}
@Test
public void testSingleFileIterTwice() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
new GammaCodedSequence[]{
gcs(1, 3, 5),
gcs(2, 4, 6),
})
);
}
catch (IOException ex) {
Assertions.fail(ex);
}
// Read the journal back
try {
var reader = new IndexJournalReaderSingleFile(tempFile);
Iterator<IndexJournalEntryTermData> iter;
IndexJournalEntryTermData termData;
try (var ptr = reader.newPointer()) {
/** DOCUMENT 1 */
assertTrue(ptr.nextDocument());
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(10, ptr.documentSize());
assertEquals(33, ptr.documentMeta());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(44, termData.metadata());
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
// Ensure we can iterate again over the same document without persisting state or closing the pointer
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(44, termData.metadata());
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
}
}
catch (IOException ex) {
Assertions.fail(ex);
}
}
@Test
public void testFiltered() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
new GammaCodedSequence[]{
gcs(1, 3, 5),
gcs(2, 4, 6),
})
);
writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
new GammaCodedSequence[]{
gcs(2, 4, 6),
gcs(3, 5, 7),
}
));
}
catch (IOException ex) {
Assertions.fail(ex);
}
// Read the journal back
try {
var reader = new IndexJournalReaderSingleFile(tempFile).filtering(meta -> meta == 45);
Iterator<IndexJournalEntryTermData> iter;
IndexJournalEntryTermData termData;
try (var ptr = reader.newPointer()) {
/** DOCUMENT 2 */
assertTrue(ptr.nextDocument());
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
assertEquals(11, ptr.documentSize());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(45, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
// No more documents
assertFalse(ptr.nextDocument());
}
}
catch (IOException ex) {
Assertions.fail(ex);
}
}
@Test
public void testIntegrationScenario() throws IOException {
Map<Long, Integer> wordMap = new HashMap<>();
for (int i = 0; i < 512; i++) {
wordMap.put(hasher.hashKeyword(Integer.toString(i)), i);
}
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
for (int idc = 1; idc < 512; idc++) {
int id = idc;
int[] factors = IntStream
.rangeClosed(1, id)
.filter(v -> (id % v) == 0)
.toArray();
System.out.println("id:" + id + " factors: " + Arrays.toString(factors));
long fullId = UrlIdCodec.encodeId((32 - (id % 32)), id);
var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
long[] metadata = new long[factors.length];
for (int i = 0; i < factors.length; i++) {
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
ByteBuffer wa = ByteBuffer.allocate(16);
for (int i = 0; i < factors.length; i++) {
positions[i] = GammaCodedSequence.generate(wa, i + 1);
}
writer.put(header, new IndexJournalEntryData(keywords, metadata, positions));
}
}
try (var ptr = new IndexJournalReaderSingleFile(tempFile).newPointer()) {
while (ptr.nextDocument()) {
int ordinal = UrlIdCodec.getDocumentOrdinal(ptr.documentId());
System.out.println(ordinal);
var expectedFactors =
new LongArrayList(IntStream
.rangeClosed(1, ordinal)
.filter(v -> (ordinal % v) == 0)
.mapToObj(Integer::toString)
.mapToLong(hasher::hashKeyword)
.toArray());
LongList foundIds = new LongArrayList();
var iter = ptr.iterator();
while (iter.hasNext()) {
var termData = iter.next();
foundIds.add(termData.termId());
}
if (!expectedFactors.equals(foundIds)) {
System.out.println("Found: ");
System.out.println(foundIds.stream().map(fac -> wordMap.getOrDefault(fac, -1)).map(Objects::toString).collect(Collectors.joining(",")));
System.out.println("Expected: ");
System.out.println(expectedFactors.stream().map(fac -> wordMap.getOrDefault(fac, -1)).map(Objects::toString).collect(Collectors.joining(",")));
fail();
}
assertEquals(expectedFactors, foundIds);
}
}
}
}

View File

@ -16,11 +16,13 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:slop')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:random-write-funnel')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
implementation project(':code:common:model')
implementation project(':code:processes:converting-process:model')
implementation project(':code:common:process')
implementation project(':third-party:parquet-floor')
@ -34,5 +36,6 @@ dependencies {
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation project(':code:libraries:test-helpers')
}

View File

@ -1,10 +0,0 @@
package nu.marginalia.index.construction;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import java.io.IOException;
import java.nio.file.Path;
public interface JournalReaderSource {
IndexJournalReader construct(Path sourceFile) throws IOException;
}

View File

@ -2,10 +2,10 @@ package nu.marginalia.index.construction.full;
import lombok.SneakyThrows;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.JournalReaderSource;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -26,20 +26,17 @@ public class FullIndexConstructor {
private final Path outputFileDocs;
private final Path outputFileWords;
private final Path outputFilePositions;
private final JournalReaderSource readerSource;
private final DocIdRewriter docIdRewriter;
private final Path tmpDir;
public FullIndexConstructor(Path outputFileDocs,
Path outputFileWords,
Path outputFilePositions,
JournalReaderSource readerSource,
DocIdRewriter docIdRewriter,
Path tmpDir) {
this.outputFileDocs = outputFileDocs;
this.outputFileWords = outputFileWords;
this.outputFilePositions = outputFilePositions;
this.readerSource = readerSource;
this.docIdRewriter = docIdRewriter;
this.tmpDir = tmpDir;
}
@ -48,8 +45,8 @@ public class FullIndexConstructor {
String processName,
Path sourceBaseDir) throws IOException
{
var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir);
if (inputs.isEmpty()) {
var journal = IndexJournal.findJournal(sourceBaseDir);
if (journal.isEmpty()) {
logger.error("No journal files in base dir {}", sourceBaseDir);
return;
}
@ -62,10 +59,12 @@ public class FullIndexConstructor {
AtomicInteger progress = new AtomicInteger(0);
inputs
.parallelStream()
var journalVersions = journal.get().pages();
journalVersions
.stream()
.map(in -> {
preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size());
preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), journalVersions.size());
return construct(in, posConstructor);
})
.reduce(this::merge)
@ -80,9 +79,9 @@ public class FullIndexConstructor {
}
@SneakyThrows
private FullPreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
private FullPreindexReference construct(IndexJournalPage journalInstance, PositionsFileConstructor positionsFileConstructor) {
return FullPreindex
.constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir)
.constructPreindex(journalInstance, positionsFileConstructor, docIdRewriter, tmpDir)
.closeToReference();
}

View File

@ -8,7 +8,7 @@ import nu.marginalia.index.construction.CountToOffsetTransformer;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.IndexSizeEstimator;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.IndexJournalPage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -43,7 +43,7 @@ public class FullPreindex {
/** Constructs a new preindex with the data associated with reader. The backing files
* will have randomly assigned names.
*/
public static FullPreindex constructPreindex(IndexJournalReader reader,
public static FullPreindex constructPreindex(IndexJournalPage journalInstance,
PositionsFileConstructor positionsFileConstructor,
DocIdRewriter docIdRewriter,
Path workDir) throws IOException
@ -52,13 +52,13 @@ public class FullPreindex {
Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
var segments = FullPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
var docs = FullPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
var segments = FullPreindexWordSegments.construct(journalInstance, segmentWordsFile, segmentCountsFile);
var docs = FullPreindexDocuments.construct(docsFile, workDir, journalInstance, docIdRewriter, positionsFileConstructor, segments);
return new FullPreindex(segments, docs);
}
/** Close the associated memory mapped areas and return
* a dehydrated version of this object that can be re-opened
* a dehydrated page of this object that can be re-opened
* later.
*/
public FullPreindexReference closeToReference() {

View File

@ -5,12 +5,13 @@ import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.rwf.RandomFileAssembler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
@ -39,13 +40,13 @@ public class FullPreindexDocuments {
public static FullPreindexDocuments construct(
Path docsFile,
Path workDir,
IndexJournalReader reader,
IndexJournalPage journalInstance,
DocIdRewriter docIdRewriter,
PositionsFileConstructor positionsFileConstructor,
FullPreindexWordSegments segments) throws IOException {
FullPreindexDocuments.positionsFileConstructor = positionsFileConstructor;
createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter);
createUnsortedDocsFile(docsFile, workDir, journalInstance, segments, docIdRewriter);
LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
sortDocsFile(docsFileMap, segments);
@ -68,28 +69,42 @@ public class FullPreindexDocuments {
private static void createUnsortedDocsFile(Path docsFile,
Path workDir,
IndexJournalReader reader,
IndexJournalPage journalInstance,
FullPreindexWordSegments segments,
DocIdRewriter docIdRewriter) throws IOException {
long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
final ByteBuffer tempBuffer = ByteBuffer.allocate(65536);
try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
var pointer = reader.newPointer())
var docIds = journalInstance.openCombinedId();
var termCounts = journalInstance.openTermCounts();
var termIds = journalInstance.openTermIds();
var termMeta = journalInstance.openTermMetadata();
var positions = journalInstance.openTermPositions())
{
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
offsetMap.defaultReturnValue(0);
while (pointer.nextDocument()) {
long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId());
for (var termData : pointer) {
long termId = termData.termId();
while (termCounts.hasRemaining()) {
long docId = docIds.get();
long rankEncodedId = docIdRewriter.rewriteDocId(docId);
long termCount = termCounts.get();
for (int termIdx = 0; termIdx < termCount; termIdx++) {
long termId = termIds.get();
byte meta = termMeta.get();
// Read positions
tempBuffer.clear();
positions.getData(tempBuffer);
tempBuffer.flip();
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
// write position data to the positions file and get the offset
long encodedPosOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positionsBuffer());
long encodedPosOffset = positionsFileConstructor.add(meta, tempBuffer);
assembly.put(offset + 0, rankEncodedId);
assembly.put(offset + 1, encodedPosOffset);

View File

@ -5,7 +5,7 @@ import nu.marginalia.array.LongArrayFactory;
import java.io.IOException;
import java.nio.file.Path;
/** This is a dehydrated version of a FullPreIndex, that only
/** This is a dehydrated page of a FullPreIndex, that only
* keeps references to its location on disk but does not hold associated
* memory maps.
*/

View File

@ -5,7 +5,7 @@ import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongIterator;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.IndexJournalPage;
import java.io.IOException;
import java.nio.file.Files;
@ -51,14 +51,20 @@ public class FullPreindexWordSegments {
return ret;
}
public static FullPreindexWordSegments construct(IndexJournalReader reader,
public static FullPreindexWordSegments construct(IndexJournalPage journalInstance,
Path wordIdsFile,
Path countsFile)
throws IOException
{
Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
countsMap.defaultReturnValue(0);
reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1));
try (var termIds = journalInstance.openTermIds()) {
while (termIds.hasRemaining()) {
countsMap.addTo(termIds.get(), 1);
}
}
LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size());
LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size());

View File

@ -2,8 +2,8 @@ package nu.marginalia.index.construction.prio;
import lombok.SneakyThrows;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.JournalReaderSource;
import nu.marginalia.index.journal.IndexJournalFileNames;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.process.control.ProcessHeartbeat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -24,18 +24,15 @@ public class PrioIndexConstructor {
private final Path outputFileDocs;
private final Path outputFileWords;
private final JournalReaderSource readerSource;
private final DocIdRewriter docIdRewriter;
private final Path tmpDir;
public PrioIndexConstructor(Path outputFileDocs,
Path outputFileWords,
JournalReaderSource readerSource,
DocIdRewriter docIdRewriter,
Path tmpDir) {
this.outputFileDocs = outputFileDocs;
this.outputFileWords = outputFileWords;
this.readerSource = readerSource;
this.docIdRewriter = docIdRewriter;
this.tmpDir = tmpDir;
}
@ -44,8 +41,8 @@ public class PrioIndexConstructor {
String processName,
Path sourceBaseDir) throws IOException
{
var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir);
if (inputs.isEmpty()) {
var journal = IndexJournal.findJournal(sourceBaseDir);
if (journal.isEmpty()) {
logger.error("No journal files in base dir {}", sourceBaseDir);
return;
}
@ -57,10 +54,12 @@ public class PrioIndexConstructor {
AtomicInteger progress = new AtomicInteger(0);
inputs
.parallelStream()
var journalVersions = journal.get().pages();
journalVersions
.stream()
.map(in -> {
preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size());
preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), journalVersions.size());
return construct(in);
})
.reduce(this::merge)
@ -75,9 +74,9 @@ public class PrioIndexConstructor {
}
@SneakyThrows
private PrioPreindexReference construct(Path input) {
private PrioPreindexReference construct(IndexJournalPage journalInstance) {
return PrioPreindex
.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir)
.constructPreindex(journalInstance, docIdRewriter, tmpDir)
.closeToReference();
}

View File

@ -6,7 +6,7 @@ import nu.marginalia.btree.BTreeWriter;
import nu.marginalia.index.ReverseIndexParameters;
import nu.marginalia.index.construction.CountToOffsetTransformer;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.IndexJournalPage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -16,7 +16,8 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import static nu.marginalia.array.algo.TwoArrayOperations.*;
import static nu.marginalia.array.algo.TwoArrayOperations.countDistinctElements;
import static nu.marginalia.array.algo.TwoArrayOperations.mergeArrays;
/** Contains the data that would go into a reverse index,
* that is, a mapping from words to documents, minus the actual
@ -41,7 +42,7 @@ public class PrioPreindex {
/** Constructs a new preindex with the data associated with reader. The backing files
* will have randomly assigned names.
*/
public static PrioPreindex constructPreindex(IndexJournalReader reader,
public static PrioPreindex constructPreindex(IndexJournalPage indexJournalPage,
DocIdRewriter docIdRewriter,
Path workDir) throws IOException
{
@ -49,13 +50,13 @@ public class PrioPreindex {
Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
var segments = PrioPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
var docs = PrioPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, segments);
var segments = PrioPreindexWordSegments.construct(indexJournalPage, segmentWordsFile, segmentCountsFile);
var docs = PrioPreindexDocuments.construct(docsFile, workDir, indexJournalPage, docIdRewriter, segments);
return new PrioPreindex(segments, docs);
}
/** Close the associated memory mapped areas and return
* a dehydrated version of this object that can be re-opened
* a dehydrated page of this object that can be re-opened
* later.
*/
public PrioPreindexReference closeToReference() {

View File

@ -4,7 +4,7 @@ import lombok.SneakyThrows;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.rwf.RandomFileAssembler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -37,11 +37,11 @@ public class PrioPreindexDocuments {
public static PrioPreindexDocuments construct(
Path docsFile,
Path workDir,
IndexJournalReader reader,
IndexJournalPage journalInstance,
DocIdRewriter docIdRewriter,
PrioPreindexWordSegments segments) throws IOException {
createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter);
createUnsortedDocsFile(docsFile, workDir, journalInstance, segments, docIdRewriter);
LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
sortDocsFile(docsFileMap, segments);
@ -54,37 +54,41 @@ public class PrioPreindexDocuments {
}
public LongArray slice(long start, long end) {
return documents.range(start, end);
}
public long size() {
return documents.size();
}
private static void createUnsortedDocsFile(Path docsFile,
Path workDir,
IndexJournalReader reader,
IndexJournalPage journalInstance,
PrioPreindexWordSegments segments,
DocIdRewriter docIdRewriter) throws IOException {
long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
var pointer = reader.newPointer())
var docIds = journalInstance.openCombinedId();
var termIdsCounts = journalInstance.openTermCounts();
var termIds = journalInstance.openTermIds();
var termMeta = journalInstance.openTermMetadata())
{
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
offsetMap.defaultReturnValue(0);
while (pointer.nextDocument()) {
long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId());
for (var termData : pointer) {
long termId = termData.termId();
while (docIds.hasRemaining()) {
long docId = docIds.get();
long rankEncodedId = docIdRewriter.rewriteDocId(docId);
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
long termCount = termIdsCounts.get();
for (int termIdx = 0; termIdx < termCount; termIdx++) {
long termId = termIds.get();
byte meta = termMeta.get();
assembly.put(offset, rankEncodedId);
if (meta != 0) {
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
assembly.put(offset, rankEncodedId);
}
}
}

View File

@ -5,7 +5,7 @@ import nu.marginalia.array.LongArrayFactory;
import java.io.IOException;
import java.nio.file.Path;
/** This is a dehydrated version of a PrioPreIndex, that only
/** This is a dehydrated page of a PrioPreIndex, that only
* keeps references to its location on disk but does not hold associated
* memory maps.
*/

View File

@ -5,7 +5,7 @@ import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongIterator;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.IndexJournalPage;
import java.io.IOException;
import java.nio.file.Files;
@ -51,14 +51,26 @@ public class PrioPreindexWordSegments {
return ret;
}
public static PrioPreindexWordSegments construct(IndexJournalReader reader,
public static PrioPreindexWordSegments construct(IndexJournalPage journalInstance,
Path wordIdsFile,
Path countsFile)
throws IOException
{
Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
countsMap.defaultReturnValue(0);
reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1));
try (var termIds = journalInstance.openTermIds();
var termMetas = journalInstance.openTermMetadata()) {
while (termIds.hasRemaining()) {
long data = termIds.get();
byte meta = termMetas.get();
if (meta != 0) {
countsMap.addTo(data, 1);
}
}
}
LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size());
LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size());

View File

@ -2,6 +2,7 @@ package nu.marginalia.index;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.construction.full.FullPreindex;
@ -45,6 +46,11 @@ class FullReverseIndexReaderTest {
Files.delete(tempDir);
}
MurmurHash3_128 hash = new MurmurHash3_128();
long termId(String keyword) {
return hash.hashKeyword(keyword);
}
@Test
public void testSimple() throws IOException {
@ -52,18 +58,19 @@ class FullReverseIndexReaderTest {
new EntryDataWithWordMeta(100, 101, wm(50, 51, 1, 3, 5))
);
assertEquals(1, indexReader.numDocuments(50));
assertEquals(1, indexReader.numDocuments(termId("50")));
var positions = indexReader.getTermData(Arena.global(), 50, new long[] { 100 });
var positions = indexReader.getTermData(Arena.global(), termId("50"), new long[] { 100 });
assertEquals(1, positions.length);
assertNotNull(positions[0]);
assertEquals((byte) 51, positions[0].flags());
assertEquals(IntList.of(1, 3, 5), positions[0].positions().values());
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, termId("50")));
}
@Test
public void test2x2() throws IOException {
@ -72,13 +79,13 @@ class FullReverseIndexReaderTest {
new EntryDataWithWordMeta(101, 101, wm(51, 53), wm(52, 54))
);
assertEquals(1, indexReader.numDocuments(50));
assertEquals(2, indexReader.numDocuments(51));
assertEquals(1, indexReader.numDocuments(52));
assertEquals(1, indexReader.numDocuments(termId("50")));
assertEquals(2, indexReader.numDocuments(termId("51")));
assertEquals(1, indexReader.numDocuments(termId("52")));
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, 51));
assertArrayEquals(new long[] { 101 }, readEntries(indexReader, 52));
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, termId("50")));
assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, termId("51")));
assertArrayEquals(new long[] { 101 }, readEntries(indexReader, termId("52")));
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.index.construction.full;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach;
@ -53,33 +54,9 @@ class FullPreindexDocsTest {
Files.delete(tempDir);
}
@Test
public void testDocs() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), new PositionsFileConstructor(positionsFile), segments);
List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }),
new TestSegmentData(10, 2, 4, new long[] { -0xF00BA3L, 0 }),
new TestSegmentData(33, 4, 6, new long[] { -0xF00BA3L, 0 }),
new TestSegmentData(40, 6, 8, new long[] { -0xF00BA3L, 0 })
);
List<TestSegmentData> actual = new ArrayList<>();
var iter = segments.iterator(2);
while (iter.next()) {
long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
docs.slice(iter.startOffset, iter.endOffset).get(0, data);
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
data));
}
assertEquals(expected, actual);
MurmurHash3_128 hash = new MurmurHash3_128();
long termId(String keyword) {
return hash.hashKeyword(keyword);
}
@Test
@ -94,7 +71,7 @@ class FullPreindexDocsTest {
segments);
List<TestSegmentData> expected = List.of(
new TestSegmentData(4, 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 })
new TestSegmentData(termId("4"), 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 })
);
List<TestSegmentData> actual = new ArrayList<>();

View File

@ -3,6 +3,7 @@ package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.model.BTreeHeader;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach;
@ -12,9 +13,11 @@ import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.index.construction.full.TestJournalFactory.*;
import static nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
@ -51,6 +54,11 @@ class FullPreindexFinalizeTest {
Files.delete(tempDir);
}
MurmurHash3_128 hash = new MurmurHash3_128();
long termId(String keyword) {
return hash.hashKeyword(keyword);
}
@Test
public void testFinalizeSimple() throws IOException {
var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
@ -81,7 +89,7 @@ class FullPreindexFinalizeTest {
assertEquals(1, wordsHeader.numEntries());
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
}
@ -121,8 +129,8 @@ class FullPreindexFinalizeTest {
long offset1 = wordsArray.get(wordsHeader.dataOffsetLongs() + 1);
long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3);
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
BTreeHeader docsHeader;

View File

@ -1,435 +0,0 @@
package nu.marginalia.index.construction.full;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import static nu.marginalia.index.construction.full.TestJournalFactory.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
class FullPreindexMergeTest {
TestJournalFactory journalFactory;
Path countsFile;
Path wordsIdFile;
Path docsFile;
Path tempDir;
Path positionsFile;
@BeforeEach
public void setUp() throws IOException {
journalFactory = new TestJournalFactory();
positionsFile = Files.createTempFile("positions", ".dat");
countsFile = Files.createTempFile("counts", ".dat");
wordsIdFile = Files.createTempFile("words", ".dat");
docsFile = Files.createTempFile("docs", ".dat");
tempDir = Files.createTempDirectory("sort");
}
@AfterEach
public void tearDown() throws IOException {
journalFactory.clear();
Files.deleteIfExists(countsFile);
Files.deleteIfExists(wordsIdFile);
List<Path> contents = new ArrayList<>();
Files.list(tempDir).forEach(contents::add);
for (var tempFile : contents) {
Files.delete(tempFile);
}
Files.delete(tempDir);
}
public FullPreindex runMergeScenario(
List<EntryDataWithWordMeta> leftData,
List<EntryDataWithWordMeta> rightData
) throws IOException {
var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new));
var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new));
var left = FullPreindex.constructPreindex(reader1, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
var right = FullPreindex.constructPreindex(reader2, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
return FullPreindex.merge(tempDir, left, right);
}
private List<TestSegmentData> getData(FullPreindex merged) {
var iter = merged.segments.iterator(2);
List<TestSegmentData> actual = new ArrayList<>();
while (iter.next()) {
long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
merged.documents.slice(iter.startOffset, iter.endOffset).get(0, data);
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
data));
}
return actual;
}
@Test
@Disabled
public void testDocsMergeSingleNoOverlap() throws IOException {
IdSequence docIds = new IdSequence();
IdSequence docMetas = new IdSequence();
IdSequence wordMetas = new IdSequence();
IdSequence wordIds = new IdSequence();
var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
var merged = runMergeScenario(
leftSequence,
rightSequence
);
var actual = getData(merged);
var expected = simulateMerge(leftSequence, rightSequence);
System.out.println(actual);
assertEquals(expected, actual);
}
@Test
@Disabled
public void testDocsMergeSingleOnlyOverlap() throws IOException {
IdSequence docIds = new IdSequence();
IdSequence docMetas = new IdSequence();
IdSequence wordMetas = new IdSequence();
IdSequence wordIds = new IdSequence();
var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.alreadySeenSameSequence(), wordMetas.nextUnique())));
var merged = runMergeScenario(
leftSequence,
rightSequence
);
var actual = getData(merged);
var expected = simulateMerge(leftSequence, rightSequence);
System.out.println(actual);
assertEquals(expected, actual);
}
@Test
@Disabled
public void testDocsMergeSingleOnlyOverlap2() throws IOException {
long wid1 = 1;
long wid2 = 2;
IdSequence docIds = new IdSequence();
IdSequence docMetas = new IdSequence();
IdSequence wordMetas = new IdSequence();
var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(),
wm(wid1, wordMetas.nextUnique()),
wm(wid2, wordMetas.nextUnique())
));
var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(),
wm(wid1, wordMetas.nextUnique()),
wm(wid2, wordMetas.nextUnique())
));
var merged = runMergeScenario(
leftSequence,
rightSequence
);
var actual = getData(merged);
var expected = simulateMerge(leftSequence, rightSequence);
System.out.println(actual);
assertEquals(expected, actual);
}
@Test
@Disabled
public void testBadCase1() throws IOException {
long wordId = 0xF00F00BA3L;
List<EntryDataWithWordMeta> leftSequence = List.of(new EntryDataWithWordMeta(40, 50,
wm(wordId, 5))
);
List<EntryDataWithWordMeta> rightSequence = List.of(new EntryDataWithWordMeta(41, 51,
wm(wordId, 3),
wm(wordId, 4))
);
var mergedLR = runMergeScenario(
leftSequence,
rightSequence
);
var mergedRL = runMergeScenario(
rightSequence,
leftSequence
);
var actualLR = getData(mergedLR);
var actualRL = getData(mergedRL);
var expected = simulateMerge(leftSequence, rightSequence);
assertEquals(actualLR, actualRL);
if (!expected.equals(actualLR)) {
System.out.println("*fail*");
System.out.println(leftSequence);
System.out.println(rightSequence);
}
else {
System.out.println("*pass*");
}
assertEquals(expected, actualLR);
}
@Test
@Disabled
public void testBadCase2() throws IOException {
long wordId = 100;
List<EntryDataWithWordMeta> leftSequence = List.of(
new EntryDataWithWordMeta(1, 50, wm(wordId, 5)),
new EntryDataWithWordMeta(2, 50, wm(wordId, 5))
);
List<EntryDataWithWordMeta> rightSequence = List.of(
new EntryDataWithWordMeta(3, 50, wm(wordId, 5))
);
var mergedLR = runMergeScenario(
leftSequence,
rightSequence
);
var mergedRL = runMergeScenario(
rightSequence,
leftSequence
);
var actualLR = getData(mergedLR);
var actualRL = getData(mergedRL);
var expected = simulateMerge(leftSequence, rightSequence);
assertEquals(actualLR, actualRL);
if (!expected.equals(actualLR)) {
System.out.println("*fail*");
System.out.println(leftSequence);
System.out.println(rightSequence);
}
else {
System.out.println("*pass*");
}
assertEquals(expected, actualLR);
}
@Test
@Disabled
public void testFuzz() throws IOException {
Random r = new Random();
int maxDocs = 150;
int maxWords = 160;
int nIters = 1000;
for (int i = 0; i < nIters; i++) {
int nLeft = 1 + r.nextInt(maxDocs);
int nRight = 1 + r.nextInt(maxDocs);
IdSequence docIdsLeft = new IdSequence();
IdSequence docIdsRight = new IdSequence();
IdSequence docMetas = new IdSequence();
IdSequence wordMetas = new IdSequence();
IdSequence wordIds = new IdSequence();
List<EntryDataWithWordMeta> leftSequence = new ArrayList<>(nLeft);
for (int j = 0; j < nLeft; j++) {
WordWithMeta[] words = new WordWithMeta[maxWords == 1 ? 1 : r.nextInt(1, maxWords)];
Arrays.setAll(words, idx -> {
long wordId = wordIds.seenWithP(1.0);
long wordMeta = wordMetas.nextUniqueAssociatedWithKey(wordId);
return wm(wordId, wordMeta);
});
long docId = docIdsLeft.nextUnique();
long docMeta = docMetas.nextUniqueAssociatedWithKey(docId);
leftSequence.add(new EntryDataWithWordMeta(docId, docMeta, words));
}
List<EntryDataWithWordMeta> rightSequence = new ArrayList<>(nLeft);
for (int j = 0; j < nRight; j++) {
WordWithMeta[] words = new WordWithMeta[maxWords == 1 ? 1 : r.nextInt(1, maxWords)];
Arrays.setAll(words, idx -> {
long wordId = wordIds.seenWithP(1.0);
long wordMeta = wordMetas.nextUniqueAssociatedWithKey(wordId);
return wm(wordId, wordMeta);
});
long docId = docIdsRight.seenWithP(docIdsLeft, 0.1);
long docMeta = docMetas.nextUniqueAssociatedWithKey(docId);
rightSequence.add(new EntryDataWithWordMeta(docId, docMeta, words));
}
var mergedLR = runMergeScenario(
leftSequence,
rightSequence
);
var mergedRL = runMergeScenario(
rightSequence,
leftSequence
);
var actualLR = getData(mergedLR);
var actualRL = getData(mergedRL);
var expected = simulateMerge(leftSequence, rightSequence);
assertEquals(actualLR, actualRL);
if (!expected.equals(actualLR)) {
System.out.println("*fail*");
System.out.println(leftSequence);
System.out.println(rightSequence);
}
else {
System.out.println("*pass*");
}
assertEquals(expected, actualLR);
}
}
public List<TestSegmentData> simulateMerge(
Collection<EntryDataWithWordMeta> leftInputs,
Collection<EntryDataWithWordMeta> rightInputs
) {
TreeMap<Long, List<DocWithMeta>> wordToDocs = new TreeMap<>();
for (var entry : leftInputs) {
for (var wm : entry.wordIds()) {
wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add(
new DocWithMeta(entry.docId(), wm.meta())
);
}
}
for (var entry : rightInputs) {
for (var wm : entry.wordIds()) {
wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add(
new DocWithMeta(entry.docId(), wm.meta())
);
}
}
List<TestSegmentData> ret = new ArrayList<>();
int[] start = new int[1];
wordToDocs.forEach((wordId, docsList) -> {
docsList.sort(Comparator.naturalOrder());
var iter = docsList.iterator();
DocWithMeta prevVal = null;
DocWithMeta currentVal;
while (iter.hasNext()) {
currentVal = iter.next();
if (prevVal != null) {
if (currentVal.docId == prevVal.docId) {
iter.remove();
}
}
prevVal = currentVal;
}
long[] data = new long[docsList.size()*2];
for (int i = 0; i < docsList.size(); i++) {
data[2*i] = docsList.get(i).docId;
data[2*i + 1] = docsList.get(i).meta;
}
ret.add(new TestSegmentData(wordId, start[0], start[0] + data.length, data));
start[0] += data.length;
});
return ret;
}
record DocWithMeta(long docId, long meta) implements Comparable<DocWithMeta> {
@Override
public int compareTo(DocWithMeta o) {
return Long.compare(docId, o.docId);
}
}
class IdSequence {
Set<Long> seen = new HashSet<>();
Map<Long, Long> associatedValues = new HashMap<>();
private Random random = new Random();
/** Return alreadySeen() with probability p,
* else nextUnique()
*/
public long seenWithP(double p) {
if (isEmpty() || random.nextDouble() > p)
return nextUnique();
return alreadySeenSameSequence();
}
public long seenWithP(IdSequence other, double p) {
if (isEmpty() || random.nextDouble() > p)
return nextUnique();
return alreadySeenOtherSequence(other);
}
public long nextUnique() {
for (;;) {
long val = random.nextLong();
if (seen.add(val)) {
return val;
}
}
}
public long nextUniqueAssociatedWithKey(long key) {
return associatedValues.computeIfAbsent(key, k -> nextUnique());
}
public long alreadySeenSameSequence() {
long[] values = seen.stream().mapToLong(Long::longValue).toArray();
int idx = random.nextInt(0, values.length);
return values[idx];
}
public long alreadySeenOtherSequence(IdSequence other) {
List<Long> values = new ArrayList<>(other.seen);
Collections.shuffle(values);
for (Long maybe : values) {
if (seen.add(maybe))
return maybe;
}
return nextUnique();
}
public boolean isEmpty() {
return seen.isEmpty();
}
}
}

View File

@ -1,231 +0,0 @@
package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArray;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.index.construction.full.TestJournalFactory.*;
import static org.junit.jupiter.api.Assertions.*;
class FullPreindexWordSegmentsTest {
Path countsFile;
Path wordsIdFile;
Path docsFile;
Path tempDir;
TestJournalFactory journalFactory;
@BeforeEach
public void setUp() throws IOException {
journalFactory = new TestJournalFactory();
countsFile = Files.createTempFile("counts", ".dat");
wordsIdFile = Files.createTempFile("words", ".dat");
docsFile = Files.createTempFile("docs", ".dat");
tempDir = Files.createTempDirectory("sort");
}
@AfterEach
public void tearDown() throws IOException {
journalFactory.clear();
Files.deleteIfExists(countsFile);
Files.deleteIfExists(wordsIdFile);
List<Path> contents = new ArrayList<>();
Files.list(tempDir).forEach(contents::add);
for (var tempFile : contents) {
Files.delete(tempFile);
}
Files.delete(tempDir);
}
@Test
public void testWordSegmentsLongWordId() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 1L<<33)
);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
new TestSegmentData(1L<<33, 0, 1)
);
List<TestSegmentData> actual = new ArrayList<>();
while (iter.next()) {
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
}
assertEquals(expected, actual);
}
@Test
public void testWordSegmentsRepeatedWordId() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 5, 5)
);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
new TestSegmentData(5, 0, 2)
);
List<TestSegmentData> actual = new ArrayList<>();
while (iter.next()) {
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
}
assertEquals(expected, actual);
}
@Test
public void testWordSegments1() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 1),
new TestSegmentData(10, 1, 2),
new TestSegmentData(33, 2, 3),
new TestSegmentData(40, 3, 4)
);
List<TestSegmentData> actual = new ArrayList<>();
while (iter.next()) {
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
}
assertEquals(expected, actual);
}
@Test
public void testWordSegments2() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33),
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 2),
new TestSegmentData(10, 2, 3),
new TestSegmentData(15, 3, 4),
new TestSegmentData(30, 4, 5),
new TestSegmentData(33, 5, 7),
new TestSegmentData(40, 7, 8)
);
List<TestSegmentData> actual = new ArrayList<>();
while (iter.next()) {
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
}
assertEquals(expected, actual);
}
@Test
public void testWordSegments_ReadIterator() {
LongArray wordsArray = LongArray.allocate(4);
LongArray countsArray = LongArray.allocate(4);
wordsArray.set(0, -1, -2, -3, -4);
countsArray.set(0, 2, 1, 3, 5);
var segments = new FullPreindexWordSegments(wordsArray, countsArray, null, null);
var ritr = segments.iterator(1);
assertTrue(ritr.hasMorePositions());
assertTrue(ritr.next());
assertTrue(ritr.isPositionBeforeEnd());
assertEquals(-1, ritr.wordId);
assertEquals(0, ritr.idx());
assertEquals(0, ritr.startOffset);
assertEquals(2, ritr.endOffset);
assertTrue(ritr.hasMorePositions());
assertTrue(ritr.next());
assertTrue(ritr.isPositionBeforeEnd());
assertEquals(-2, ritr.wordId);
assertEquals(1, ritr.idx());
assertEquals(2, ritr.startOffset);
assertEquals(3, ritr.endOffset);
assertTrue(ritr.hasMorePositions());
assertTrue(ritr.next());
assertTrue(ritr.isPositionBeforeEnd());
assertEquals(-3, ritr.wordId);
assertEquals(2, ritr.idx());
assertEquals(3, ritr.startOffset);
assertEquals(6, ritr.endOffset);
assertTrue(ritr.hasMorePositions());
assertTrue(ritr.next());
assertTrue(ritr.isPositionBeforeEnd());
assertEquals(-4, ritr.wordId);
assertEquals(3, ritr.idx());
assertEquals(6, ritr.startOffset);
assertEquals(11, ritr.endOffset);
assertFalse(ritr.hasMorePositions());
assertFalse(ritr.next());
assertFalse(ritr.isPositionBeforeEnd());
assertEquals(Long.MIN_VALUE, ritr.wordId);
}
@Test
public void testWordSegments_ConstructionIterator() {
LongArray wordsArray = LongArray.allocate(4);
LongArray countsArray = LongArray.allocate(4);
wordsArray.set(0, -1, -2, -3, -4);
var segments = new FullPreindexWordSegments(wordsArray, countsArray, null, null);
var citr = segments.constructionIterator(1);
assertEquals(-1, citr.wordId);
assertEquals(0, citr.idx());
assertTrue(citr.canPutMore());
assertTrue(citr.putNext(1));
assertEquals(1, countsArray.get(0));
assertEquals(-2, citr.wordId);
assertEquals(1, citr.idx());
assertTrue(citr.canPutMore());
assertTrue(citr.putNext(2));
assertEquals(2, countsArray.get(1));
assertEquals(-3, citr.wordId);
assertEquals(2, citr.idx());
assertTrue(citr.canPutMore());
assertTrue(citr.putNext(3));
assertEquals(3, countsArray.get(2));
assertEquals(-4, citr.wordId);
assertEquals(3, citr.idx());
assertTrue(citr.canPutMore());
assertFalse(citr.putNext(4));
assertEquals(4, countsArray.get(3));
assertEquals(4, citr.idx());
assertFalse(citr.canPutMore());
assertEquals(Long.MIN_VALUE, citr.wordId);
}
}

View File

@ -1,17 +1,15 @@
package nu.marginalia.index.construction.full;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.test.TestUtil;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
@ -22,17 +20,13 @@ public class TestJournalFactory {
public TestJournalFactory() throws IOException {}
public void clear() throws IOException {
List<Path> toDelete = new ArrayList<>();
try (var dirStream = Files.list(tempDir)) {
dirStream.forEach(toDelete::add);
}
for (var tempFile : toDelete) {
Files.delete(tempFile);
}
Files.delete(tempDir);
TestUtil.clearTempDir(tempDir);
}
public record EntryData(long docId, long docMeta, long... wordIds) {
public record EntryData(long docId, long docMeta, String... wordIds) {
public EntryData(long docId, long docMeta, long... wordIds) {
this(docId, docMeta, Arrays.stream(wordIds).mapToObj(String::valueOf).toArray(String[]::new));
}
@Override
public String toString() {
return "EntryData{" +
@ -52,19 +46,23 @@ public class TestJournalFactory {
'}';
}
}
public record WordWithMeta(long wordId, long meta, GammaCodedSequence gcs) {}
public static WordWithMeta wm(long wordId, long meta, int... positions) {
return new WordWithMeta(wordId, meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions));
public record WordWithMeta(String wordId, byte meta, GammaCodedSequence gcs) {
public WordWithMeta(long wordId, byte meta, GammaCodedSequence gcs) {
this(String.valueOf(wordId), meta, gcs);
}
}
public static WordWithMeta wm(long wordId, int meta, int... positions) {
return new WordWithMeta(wordId, (byte) meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions));
}
public IndexJournalReader createReader(EntryData... entries) throws IOException {
Path jf = Files.createTempFile(tempDir, "journal", ".dat");
var writer = new IndexJournalWriterSingleFileImpl(jf);
public IndexJournalPage createReader(EntryData... entries) throws IOException {
Path ji = Files.createTempDirectory(tempDir, "journal");
var writer = new IndexJournalSlopWriter(ji, 0);
for (var entry : entries) {
long[] termIds = new long[entry.wordIds.length];
long[] meta = new long[entry.wordIds.length];
String[] termIds = new String[entry.wordIds.length];
byte[] meta = new byte[entry.wordIds.length];
GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length];
for (int i = 0; i < entry.wordIds.length; i++) {
@ -73,22 +71,35 @@ public class TestJournalFactory {
positions[i] = new GammaCodedSequence(new byte[1]);
}
writer.put(new IndexJournalEntryHeader(entries.length, 0, 15, entry.docId, entry.docMeta),
new IndexJournalEntryData(termIds, meta, positions));
writer.put(
entry.docId,
new SlopDocumentRecord.KeywordsProjection(
"test",
-1,
0,
entry.docMeta,
15,
Arrays.asList(termIds),
meta,
Arrays.asList(positions),
new byte[0],
List.of()
)
);
}
writer.close();
var ret = new IndexJournalReaderSingleFile(jf);
return ret;
return new IndexJournalPage(ji, 0);
}
public IndexJournalReader createReader(EntryDataWithWordMeta... entries) throws IOException {
Path jf = Files.createTempFile(tempDir, "journal", ".dat");
public IndexJournalPage createReader(EntryDataWithWordMeta... entries) throws IOException {
Path ji = Files.createTempDirectory(tempDir, "journal");
var writer = new IndexJournalWriterSingleFileImpl(jf);
var writer = new IndexJournalSlopWriter(ji, 0);
for (var entry : entries) {
long[] termIds = new long[entry.wordIds.length];
long[] meta = new long[entry.wordIds.length];
String[] termIds = new String[entry.wordIds.length];
byte[] meta = new byte[entry.wordIds.length];
GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length];
for (int i = 0; i < entry.wordIds.length; i++) {
termIds[i] = entry.wordIds[i].wordId;
@ -96,11 +107,25 @@ public class TestJournalFactory {
positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, () -> new GammaCodedSequence(new byte[1]));
}
writer.put(new IndexJournalEntryHeader(entries.length, 0, 15, entry.docId, entry.docMeta),
new IndexJournalEntryData(termIds, meta, positions));
writer.put(
entry.docId,
new SlopDocumentRecord.KeywordsProjection(
"test",
-1,
0,
entry.docMeta,
15,
Arrays.asList(termIds),
meta,
Arrays.asList(positions),
new byte[0],
List.of()
)
);
}
writer.close();
var ret = new IndexJournalReaderSingleFile(jf);
return ret;
return new IndexJournalPage(ji, 0);
}
}
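Condensing the put() pattern above into one place, a rough sketch of writing a single document through the slop-based journal writer; journalDir, docId and docMeta are placeholders, imports are elided, and the trailing new byte[0] / List.of() arguments are passed empty exactly as in the factory above:
// Sketch only: one document written through IndexJournalSlopWriter.
// All concrete values are placeholders, not part of the commit.
var writer = new IndexJournalSlopWriter(journalDir, 0);
writer.put(docId,
        new SlopDocumentRecord.KeywordsProjection(
                "test",                                  // source tag, as in the factory
                -1,
                0,                                       // features
                docMeta,
                15,                                      // document length
                List.of("example"),                      // keywords
                new byte[] { WordFlags.Title.asBit() },  // per-keyword flag bytes
                List.of(GammaCodedSequence.generate(ByteBuffer.allocate(128), 1, 3, 5)),
                new byte[0],
                List.of()
        ));
writer.close();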

View File

@ -2,8 +2,8 @@ package nu.marginalia.index.construction.full;
import java.util.Arrays;
record TestSegmentData(long wordId, long start, long end, long[] data) {
public TestSegmentData(long wordId, long start, long end) {
record TestSegmentData(String wordId, long start, long end, long[] data) {
public TestSegmentData(String wordId, long start, long end) {
this(wordId, start, end, null);
}
@ -22,7 +22,7 @@ record TestSegmentData(long wordId, long start, long end, long[] data) {
@Override
public int hashCode() {
int result = (int) (wordId ^ (wordId >>> 32));
int result = wordId.hashCode();
result = 31 * result + (int) (start ^ (start >>> 32));
result = 31 * result + (int) (end ^ (end >>> 32));
result = 31 * result + Arrays.hashCode(data);

View File

@ -1,6 +1,7 @@
package nu.marginalia.index.construction.prio;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.full.TestJournalFactory;
@ -17,7 +18,7 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import static nu.marginalia.index.construction.full.TestJournalFactory.*;
import static nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
@ -59,6 +60,11 @@ class PrioPreindexTest {
Files.delete(tempDir);
}
MurmurHash3_128 hash = new MurmurHash3_128();
long termId(String keyword) {
return hash.hashKeyword(keyword);
}
@Test
public void testFinalizeSimple() throws IOException {
var journalReader = journalFactory.createReader(
@ -79,7 +85,7 @@ class PrioPreindexTest {
var indexReader = new PrioReverseIndexReader("test", wordsFile, docsFile);
var entrySource = indexReader.documents(50);
var entrySource = indexReader.documents(termId("50"));
var lqb = new LongQueryBuffer(32);
entrySource.read(lqb);
@ -139,10 +145,10 @@ class PrioPreindexTest {
var indexReader = new PrioReverseIndexReader("test", wordsFile, docsFile);
int items = indexReader.numDocuments(50);
int items = indexReader.numDocuments(termId("50"));
assertEquals(documentIds.length, items);
var entrySource = indexReader.documents(50);
var entrySource = indexReader.documents(termId("50"));
var lqb = new LongQueryBuffer(32);
for (int pos = 0; pos < documentIds.length;) {

View File

@ -1,43 +0,0 @@
package nu.marginalia.test;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
public class TestUtil {
public static void clearTempDir(Path dir) {
if (Files.isDirectory(dir)) {
for (File f : dir.toFile().listFiles()) {
File[] files = f.listFiles();
if (files != null) {
Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir);
}
System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")");
f.delete();
}
}
System.out.println("Deleting " + dir);
dir.toFile().delete();
}
private static String fileSize(Path path) {
try {
long sizeBytes = Files.size(path);
if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb";
if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb";
if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb";
return sizeBytes + "b";
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
}
private static String round(double d) {
return String.format("%.2f", d);
}
}

View File

@ -3,11 +3,11 @@ package nu.marginalia.index;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.ForwardIndexReader;
import java.io.IOException;
import java.nio.file.Files;
@ -56,7 +56,8 @@ public class IndexFactory {
public ForwardIndexReader getForwardIndexReader() throws IOException {
return new ForwardIndexReader(
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.CURRENT),
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.CURRENT)
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.CURRENT),
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.CURRENT)
);
}

View File

@ -13,7 +13,9 @@ import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.*;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultSet;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
@ -22,9 +24,9 @@ import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.index.searchset.SearchSetsService;
import nu.marginalia.index.searchset.SmallSearchSet;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -32,7 +34,8 @@ import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.sql.SQLException;
import java.util.*;
import java.util.BitSet;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
@ -142,7 +145,8 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
for (var score : rawResult.keywordScores) {
rawItem.addKeywordScores(
RpcResultKeywordScore.newBuilder()
.setEncodedWordMetadata(score.encodedWordMetadata())
.setFlags(score.flags)
.setPositions(score.positionCount)
.setKeyword(score.keyword)
);
}

View File

@ -90,7 +90,7 @@ public class StatefulIndex {
return combinedIndexReader != null;
}
/** Stronger version of isAvailable() that also checks that the index is loaded */
/** Stronger page of isAvailable() that also checks that the index is loaded */
public boolean isLoaded() {
return combinedIndexReader != null && combinedIndexReader.isLoaded();
}

View File

@ -1,13 +1,16 @@
package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.*;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
@ -15,13 +18,13 @@ import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations;
import javax.annotation.Nullable;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate;
/** This class is responsible for calculating the score of a search result.
* It holds the data required to perform the scoring, as there is strong
@ -102,7 +105,7 @@ public class IndexResultScoreCalculator {
}
private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) {
boolean allSynthetic = booleanAggregate(wordFlagsQuery, WordFlags.Synthetic::isPresent);
boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags));
int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask));
int positionsCount = intMaxMinAggregate(countsQuery, p -> p);
@ -139,27 +142,27 @@ public class IndexResultScoreCalculator {
}
return booleanAggregate(queryGraphScores,
docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy()));
flags -> meetsQueryStrategyRequirements((byte) flags, queryParams.queryStrategy()));
}
private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) {
private boolean meetsQueryStrategyRequirements(byte flags, QueryStrategy queryStrategy) {
if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) {
return WordFlags.Site.isPresent(wordMeta);
return WordFlags.Site.isPresent(flags);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) {
return WordFlags.Subjects.isPresent(wordMeta);
return WordFlags.Subjects.isPresent(flags);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
return WordFlags.Title.isPresent(wordMeta);
return WordFlags.Title.isPresent(flags);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) {
return WordFlags.UrlPath.isPresent(wordMeta);
return WordFlags.UrlPath.isPresent(flags);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) {
return WordFlags.UrlDomain.isPresent(wordMeta);
return WordFlags.UrlDomain.isPresent(flags);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) {
return WordFlags.ExternalLink.isPresent(wordMeta);
return WordFlags.ExternalLink.isPresent(flags);
}
return true;
}
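Because term metadata is now a single flag byte rather than a packed long, each strategy check reduces to a plain bit test; a minimal illustration with arbitrarily chosen flags:
byte flags = WordFlags.Title.asBit();               // a term flagged as appearing in the title
boolean inTitle = WordFlags.Title.isPresent(flags); // true
boolean onSite  = WordFlags.Site.isPresent(flags);  // false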

View File

@ -13,10 +13,8 @@ import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.linkdb.docs.DocumentDbReader;
@ -27,9 +25,10 @@ import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.storage.FileStorageService;
@ -63,7 +62,7 @@ public class CombinedIndexReaderTest {
StatefulIndex statefulIndex;
@Inject
IndexJournalWriter indexJournalWriter;
IndexJournalSlopWriter indexJournalWriter;
@Inject
FileStorageService fileStorageService;
@ -248,7 +247,6 @@ public class CombinedIndexReaderTest {
outputFileDocs,
outputFileWords,
outputFilePositions,
IndexJournalReader::singleFile,
DocIdRewriter.identity(),
tmpDir);
constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir);
@ -268,7 +266,6 @@ public class CombinedIndexReaderTest {
outputFileDocs,
outputFileWords,
outputFilePositions,
IndexJournalReader::singleFile,
DocIdRewriter.identity(),
tmpDir);
@ -279,12 +276,14 @@ public class CombinedIndexReaderTest {
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat,
IndexJournalReader.paging(workDir),
outputFileDocsId,
outputFileDocsData,
outputFileSpansData,
IndexJournal.findJournal(workDir).orElseThrow(),
domainRankings
);
@ -318,19 +317,26 @@ public class CombinedIndexReaderTest {
var meta = metaByDoc.get(doc);
var header = new IndexJournalEntryHeader(
doc,
meta.features,
100,
meta.documentMetadata.encode()
);
String[] keywords = words.stream().map(w -> w.keyword).toArray(String[]::new);
long[] metadata = words.stream().map(w -> w.termMetadata).mapToLong(Long::longValue).toArray();
var positions = words.stream().map(w -> w.positions).map(pos -> GammaCodedSequence.generate(ByteBuffer.allocate(1024), pos.toIntArray())).toArray(GammaCodedSequence[]::new);
List<String> keywords = words.stream().map(w -> w.keyword).toList();
byte[] metadata = new byte[words.size()];
for (int i = 0; i < words.size(); i++) {
metadata[i] = words.get(i).termMetadata;
}
var positions = words.stream().map(w -> w.positions).map(pos -> (CodedSequence) GammaCodedSequence.generate(ByteBuffer.allocate(1024), pos.toIntArray())).toList();
indexJournalWriter.put(header,
new IndexJournalEntryData(keywords, metadata, positions));
indexJournalWriter.put(doc,
new SlopDocumentRecord.KeywordsProjection(
"",
-1,
meta.features,
meta.documentMetadata.encode(),
100,
keywords,
metadata,
positions,
new byte[0],
List.of()
));
});
var linkdbWriter = new DocumentDbWriter(
@ -370,10 +376,10 @@ public class CombinedIndexReaderTest {
}
record MockDocumentMeta(int features, DocumentMetadata documentMetadata) {}
record MockDataKeyword(String keyword, long termMetadata, IntList positions) {}
record MockDataKeyword(String keyword, byte termMetadata, IntList positions) {}
MockDataKeyword w(String keyword, WordFlags flags, int... positions) {
return new MockDataKeyword(keyword, new WordMetadata(0L, EnumSet.of(flags)).encode(), IntList.of(positions));
return new MockDataKeyword(keyword, flags.asBit(), IntList.of(positions));
}
}

View File

@ -4,23 +4,18 @@ import com.google.inject.Guice;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.IndexLocations;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
@ -29,12 +24,16 @@ import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.storage.FileStorageService;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
@ -70,7 +69,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
ServiceHeartbeat heartbeat;
@Inject
IndexJournalWriter indexJournalWriter;
IndexJournalSlopWriter indexJournalWriter;
@Inject
FileStorageService fileStorageService;
@ -296,7 +295,6 @@ public class IndexQueryServiceIntegrationSmokeTest {
outputFileDocs,
outputFileWords,
outputFilePositions,
IndexJournalReader::singleFile,
DocIdRewriter.identity(),
tmpDir);
@ -316,7 +314,6 @@ public class IndexQueryServiceIntegrationSmokeTest {
var constructor = new PrioIndexConstructor(
outputFileDocs,
outputFileWords,
IndexJournalReader::singleFile,
DocIdRewriter.identity(),
tmpDir);
@ -327,12 +324,14 @@ public class IndexQueryServiceIntegrationSmokeTest {
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat,
IndexJournalReader.paging(workDir),
outputFileDocsId,
outputFileDocsData,
outputFileSpansData,
IndexJournal.findJournal(workDir).orElseThrow(),
domainRankings
);
@ -354,32 +353,44 @@ public class IndexQueryServiceIntegrationSmokeTest {
long fullId = fullId(id);
var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
ldbw.add(new DocdbUrlDetail(
fullId, new EdgeUrl("https://www.example.com/"+id),
"test", "test", 0., "HTML5", 0, null, 0, 10
));
String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
long[] metadata = new long[factors.length];
List<String> keywords = IntStream.of(factors).mapToObj(Integer::toString).toList();
byte[] metadata = new byte[factors.length];
for (int i = 0; i < factors.length; i++) {
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
ByteBuffer wa = ByteBuffer.allocate(32);
for (int i = 0; i < factors.length; i++) {
positions[i] = GammaCodedSequence.generate(wa, factors);
metadata[i] = WordFlags.Title.asBit();
}
indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
List<CodedSequence> positions = new ArrayList<>();
ByteBuffer wa = ByteBuffer.allocate(32);
for (int i = 0; i < factors.length; i++) {
positions.add(GammaCodedSequence.generate(wa, factors));
}
indexJournalWriter.put(fullId,
new SlopDocumentRecord.KeywordsProjection(
"",
-1,
0,
new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode(),
100,
keywords,
metadata,
positions,
new byte[0],
List.of()
));
}
@SneakyThrows
public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) {
int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
long fullId = UrlIdCodec.encodeId(domain, id);
var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, DocumentMetadata.defaultValue());
ldbw.add(new DocdbUrlDetail(
fullId, new EdgeUrl("https://www.example.com/"+id),
@ -387,18 +398,33 @@ public class IndexQueryServiceIntegrationSmokeTest {
));
String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
long[] metadata = new long[factors.length];
List<String> keywords = IntStream.of(factors).mapToObj(Integer::toString).toList();
byte[] metadata = new byte[factors.length];
for (int i = 0; i < factors.length; i++) {
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
ByteBuffer wa = ByteBuffer.allocate(16);
for (int i = 0; i < factors.length; i++) {
positions[i] = GammaCodedSequence.generate(wa, i + 1);
metadata[i] = WordFlags.Title.asBit();
}
indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
List<CodedSequence> positions = new ArrayList<>();
ByteBuffer wa = ByteBuffer.allocate(32);
for (int i = 0; i < factors.length; i++) {
positions.add(GammaCodedSequence.generate(wa, i + 1));
}
indexJournalWriter.put(fullId,
new SlopDocumentRecord.KeywordsProjection(
"",
-1,
0,
new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode(),
100,
keywords,
metadata,
positions,
new byte[0],
List.of()
));
}
}

View File

@ -5,22 +5,19 @@ import com.google.inject.Inject;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.IndexLocations;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
@ -33,12 +30,14 @@ import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.storage.FileStorageService;
import org.apache.logging.log4j.util.Strings;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
@ -76,7 +75,7 @@ public class IndexQueryServiceIntegrationTest {
ServiceHeartbeat heartbeat;
@Inject
IndexJournalWriter indexJournalWriter;
IndexJournalSlopWriter indexJournalWriter;
@Inject
FileStorageService fileStorageService;
@ -475,7 +474,6 @@ public class IndexQueryServiceIntegrationTest {
outputFileDocs,
outputFileWords,
outputFilePositions,
IndexJournalReader::singleFile,
DocIdRewriter.identity(),
tmpDir);
constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir);
@ -493,7 +491,6 @@ public class IndexQueryServiceIntegrationTest {
var constructor = new PrioIndexConstructor(
outputFileDocs,
outputFileWords,
IndexJournalReader::singleFile,
DocIdRewriter.identity(),
tmpDir);
@ -504,12 +501,14 @@ public class IndexQueryServiceIntegrationTest {
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat,
IndexJournalReader.paging(workDir),
outputFileDocsId,
outputFileDocsData,
outputFileSpansData,
IndexJournal.findJournal(workDir).orElseThrow(),
domainRankings
);
@ -539,24 +538,32 @@ public class IndexQueryServiceIntegrationTest {
var meta = metaByDoc.get(doc);
var header = new IndexJournalEntryHeader(
doc,
meta.features,
100,
meta.documentMetadata.encode()
);
String[] keywords = words.stream().map(w -> w.keyword).toArray(String[]::new);
long[] metadata = words.stream().map(w -> w.termMetadata).mapToLong(Long::longValue).toArray();
GammaCodedSequence[] positions = new GammaCodedSequence[words.size()]; // FIXME: positions?
ByteBuffer workBuffer = ByteBuffer.allocate(8192);
for (int i = 0; i < positions.length; i++) {
positions[i] = GammaCodedSequence.generate(workBuffer, words.get(i).positions);
indexJournalWriter.put(header,
new IndexJournalEntryData(keywords, metadata, positions));
List<String> keywords = words.stream().map(w -> w.keyword).toList();
byte[] metadata = new byte[keywords.size()];
for (int i = 0; i < words.size(); i++) {
metadata[i] = (byte) words.get(i).termMetadata;
}
List<CodedSequence> positions = new ArrayList<>();
ByteBuffer workBuffer = ByteBuffer.allocate(8192);
for (int i = 0; i < words.size(); i++) {
positions.add(GammaCodedSequence.generate(workBuffer, words.get(i).positions));
}
indexJournalWriter.put(doc,
new SlopDocumentRecord.KeywordsProjection(
"",
-1,
meta.features,
meta.documentMetadata.encode(),
100,
keywords,
metadata,
positions,
new byte[0],
List.of()
));
});
var linkdbWriter = new DocumentDbWriter(
@ -599,8 +606,8 @@ public class IndexQueryServiceIntegrationTest {
record MockDataKeyword(String keyword, long termMetadata, IntList positions) {}
public MockDataKeyword w(String keyword, EnumSet<WordFlags> wordFlags, int... positions) {
return new MockDataKeyword(keyword, new WordMetadata(0, wordFlags).encode(), IntList.of(positions));
return new MockDataKeyword(keyword, WordFlags.encode(wordFlags), IntList.of(positions));
}
public MockDataKeyword w(String keyword) { return new MockDataKeyword(keyword, 0L, IntList.of()); }
public MockDataKeyword w(String keyword, WordFlags flags) { return new MockDataKeyword(keyword, new WordMetadata(0L, EnumSet.of(flags)).encode(), IntList.of()); }
public MockDataKeyword w(String keyword, WordFlags flags) { return new MockDataKeyword(keyword, flags.asBit(), IntList.of()); }
}

View File

@ -2,21 +2,23 @@ package nu.marginalia.index;
import com.google.inject.AbstractModule;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.index.searchset.SearchSetAny;
import nu.marginalia.index.searchset.SearchSetsService;
import nu.marginalia.index.util.TestUtil;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBase;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.service.control.*;
import nu.marginalia.service.ServiceId;
import nu.marginalia.service.control.FakeServiceHeartbeat;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBase;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.test.TestUtil;
import org.mockito.Mockito;
import java.io.IOException;
@ -41,8 +43,10 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule {
slowDir = workDir.resolve("slow");
fastDir = workDir.resolve("fast");
Files.createDirectory(slowDir);
Files.createDirectory(fastDir);
Files.createDirectory(fastDir.resolve("iw"));
}
public void cleanUp() {
@ -75,9 +79,7 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule {
bind(ServiceEventLog.class).toInstance(Mockito.mock(ServiceEventLog.class));
bind(IndexJournalWriter.class).toInstance(new IndexJournalWriterPagingImpl(
IndexLocations.getIndexConstructionArea(fileStorageServiceMock)
));
bind(IndexJournalSlopWriter.class).toInstance(new IndexJournalSlopWriter(IndexJournal.allocateName(fastDir.resolve("iw")), 0));
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(
ServiceId.Index,

View File

@ -1,44 +0,0 @@
package nu.marginalia.index.util;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
public class TestUtil {
public static void clearTempDir(Path path) {
if (Files.isDirectory(path)) {
for (File f : path.toFile().listFiles()) {
File[] files = f.listFiles();
if (files != null) {
Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir);
}
System.out.println("Deleting " + f);
f.delete();
}
}
System.out.println("Deleting " + path + " (" + fileSize(path) + ")");
path.toFile().delete();
}
private static String fileSize(Path path) {
try {
long sizeBytes = Files.size(path);
if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb";
if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb";
if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb";
return sizeBytes + "b";
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
}
private static String round(double d) {
return String.format("%.2f", d);
}
}

View File

@ -26,6 +26,8 @@ dependencies {
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation project(':code:libraries:test-helpers')
}
jmh {

View File

@ -4,7 +4,7 @@ import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.util.test.TestUtil;
import nu.marginalia.test.TestUtil;
import org.apache.commons.lang3.ArrayUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
@ -13,7 +13,7 @@ import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.Random;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

View File

@ -3,7 +3,7 @@ package nu.marginalia.array.algo;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.util.test.TestUtil;
import nu.marginalia.test.TestUtil;
import org.apache.commons.lang3.ArrayUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;

View File

@ -1,43 +0,0 @@
package nu.marginalia.util.test;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
public class TestUtil {
public static void clearTempDir(Path dir) {
if (Files.isDirectory(dir)) {
for (File f : dir.toFile().listFiles()) {
File[] files = f.listFiles();
if (files != null) {
Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir);
}
System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")");
f.delete();
}
}
System.out.println("Deleting " + dir);
dir.toFile().delete();
}
private static String fileSize(Path path) {
try {
long sizeBytes = Files.size(path);
if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb";
if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb";
if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb";
return sizeBytes + "b";
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
}
private static String round(double d) {
return String.format("%.2f", d);
}
}

View File

@ -1,12 +1,11 @@
package nu.marginalia.sequence;
import blue.strategic.parquet.BinarySerializable;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import java.nio.ByteBuffer;
public interface CodedSequence extends BinarySerializable {
public interface CodedSequence {
byte[] bytes();
IntIterator iterator();

View File

@ -158,7 +158,7 @@ public class GammaCodedSequence implements Iterable<Integer>, CodedSequence {
last = i;
// can't encode zeroes
assert delta > 0 : "Sequence must be strictly increasing and may not contain zeroes or negative values";
assert delta > 0 : "Sequence must be strictly increasing and may not contain zeroes or negative values; was " + sequence;
writer.putGamma(delta);
}

View File

@ -1,21 +1,24 @@
package nu.marginalia.language.sentence.tag;
public enum HtmlTag {
SCRIPT(true, false),
STYLE(true, false),
CODE(false, true),
PRE(false, true),
TITLE(false, false),
HEADING(false, false),
NAV(false, false),
HEADER(false, false),
FOOTER(false, false);
SCRIPT('s', true, false),
STYLE('S', true, false),
CODE('c', false, true),
PRE('p', false, true),
TITLE('t', false, false),
HEADING('h', false, false),
NAV('n', false, false),
HEADER('H', false, false),
FOOTER('f', false, false);
public char code;
public boolean exclude;
public boolean nonLanguage;
HtmlTag(boolean exclude, boolean nonLanguage) {
HtmlTag(char code, boolean exclude, boolean nonLanguage) {
this.code = code;
this.exclude = exclude;
this.nonLanguage = nonLanguage;
}
}

View File

@ -15,6 +15,8 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation libs.bundles.slf4j
implementation project(':code:libraries:coded-sequence')
implementation libs.notnull
implementation libs.commons.lang3
implementation libs.fastutil
@ -22,6 +24,7 @@ dependencies {
implementation libs.guava
implementation libs.commons.compress
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito

View File

@ -0,0 +1,121 @@
package nu.marginalia.slop.column.dynamic;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.slop.desc.ColumnDesc;
import nu.marginalia.slop.desc.ColumnFunction;
import nu.marginalia.slop.desc.ColumnType;
import nu.marginalia.slop.desc.StorageType;
import nu.marginalia.slop.storage.Storage;
import nu.marginalia.slop.storage.StorageReader;
import nu.marginalia.slop.storage.StorageWriter;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Path;
public class GammaCodedSequenceColumn {
public static GammaCodedSequenceReader open(Path path, ColumnDesc name) throws IOException {
return new Reader(
Storage.reader(path, name, false), // note: we must never pass aligned=true here, as the data is not guaranteed to be aligned
VarintColumn.open(path, name.createDerivative(ColumnFunction.DATA_LEN,
ColumnType.VARINT_LE,
StorageType.PLAIN)
)
);
}
public static GammaCodedSequenceWriter create(Path path, ColumnDesc name) throws IOException {
return new Writer(
Storage.writer(path, name),
VarintColumn.create(path, name.createDerivative(ColumnFunction.DATA_LEN,
ColumnType.VARINT_LE,
StorageType.PLAIN)
)
);
}
private static class Writer implements GammaCodedSequenceWriter {
private final VarintColumnWriter indexWriter;
private final StorageWriter storage;
public Writer(StorageWriter storage,
VarintColumnWriter indexWriter)
{
this.storage = storage;
this.indexWriter = indexWriter;
}
@Override
public void put(GammaCodedSequence sequence) throws IOException {
var buffer = sequence.buffer();
int length = buffer.remaining();
indexWriter.put(length);
storage.putBytes(buffer);
}
public void close() throws IOException {
indexWriter.close();
storage.close();
}
}
private static class Reader implements GammaCodedSequenceReader {
private final VarintColumnReader indexReader;
private final StorageReader storage;
public Reader(StorageReader reader, VarintColumnReader indexReader) throws IOException {
this.storage = reader;
this.indexReader = indexReader;
}
@Override
public void skip(long positions) throws IOException {
for (int i = 0; i < positions; i++) {
int size = (int) indexReader.get();
storage.skip(size, 1);
}
}
@Override
public boolean hasRemaining() throws IOException {
return indexReader.hasRemaining();
}
public long position() throws IOException {
return indexReader.position();
}
@Override
public GammaCodedSequence get(ByteBuffer workArea) throws IOException {
int size = (int) indexReader.get();
workArea.clear();
workArea.limit(size);
storage.getBytes(workArea);
workArea.flip();
return new GammaCodedSequence(workArea);
}
@Override
public void getData(ByteBuffer workArea) throws IOException {
int size = (int) indexReader.get();
int oldLimit = workArea.limit();
workArea.limit(workArea.position() + size);
storage.getBytes(workArea);
workArea.limit(oldLimit);
}
public void close() throws IOException {
indexReader.close();
storage.close();
}
}
}
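For orientation only (not part of the commit): a minimal round-trip sketch of the new column. It assumes a ColumnDesc supplied by the surrounding slop schema code, and uses GammaCodedSequence.generate with a reusable work buffer in the same way the test code elsewhere in this diff does; the class and variable names below are hypothetical.
// Illustrative sketch; GammaCodedSequenceColumnExample is not part of this commit.
class GammaCodedSequenceColumnExample {
    static void roundTrip(Path dir, ColumnDesc desc) throws IOException {
        ByteBuffer workArea = ByteBuffer.allocate(8192);
        // write one sequence; the derived varint column records its encoded byte length
        try (GammaCodedSequenceWriter writer = GammaCodedSequenceColumn.create(dir, desc)) {
            writer.put(GammaCodedSequence.generate(workArea, IntList.of(1, 3, 8)));
        }
        // read it back, reusing a single work buffer across calls
        try (GammaCodedSequenceReader reader = GammaCodedSequenceColumn.open(dir, desc)) {
            while (reader.hasRemaining()) {
                CodedSequence sequence = reader.get(workArea);
                // iterate sequence.iterator() to recover the encoded positions
            }
        }
    }
}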

View File

@ -0,0 +1,34 @@
package nu.marginalia.slop.column.dynamic;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.slop.column.ColumnReader;
import java.io.IOException;
import java.nio.ByteBuffer;
public interface GammaCodedSequenceReader extends AutoCloseable, ColumnReader {
/** Read the next gamma-coded sequence from the column. Unlike most other
* readers, this method requires an intermediate buffer to use for reading
* the sequence. As this buffer typically needs to be fairly large to accommodate
* the largest possible sequence, it is not practical to allocate a new buffer
* for each call to this method. Instead, the caller should allocate a buffer
* once and reuse it for each call to this method.
*
* @param workArea A buffer to use for reading the sequence.
* @return The next gamma-coded sequence.
*/
CodedSequence get(ByteBuffer workArea) throws IOException;
/** Read just the data portion of the next gamma-coded sequence from the column.
* This method is useful when the caller is only interested in the data portion
* of the sequence and does not want to decode the values.
*
* The position of the buffer is advanced to the end of the data that has just been read,
* and the limit remains the same.
*
* @param workArea A buffer to use for reading the data.
*/
void getData(ByteBuffer workArea) throws IOException;
void close() throws IOException;
}
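A hedged usage sketch of the getData path described above: copying the still-encoded bytes of successive rows into a single destination buffer without decoding them. The helper name and buffer capacity are illustrative, and the sketch assumes the destination has room for each row.
// Sketch only: bulk-copy raw encoded rows into one buffer.
static ByteBuffer copyRaw(GammaCodedSequenceReader reader, int capacity) throws IOException {
    ByteBuffer dest = ByteBuffer.allocate(capacity);
    while (reader.hasRemaining()) {
        reader.getData(dest); // appends the next row's bytes at dest.position(); the limit is unchanged
    }
    dest.flip();
    return dest;
}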

View File

@ -0,0 +1,11 @@
package nu.marginalia.slop.column.dynamic;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.slop.column.ColumnWriter;
import java.io.IOException;
public interface GammaCodedSequenceWriter extends AutoCloseable, ColumnWriter {
void put(GammaCodedSequence sequence) throws IOException;
void close() throws IOException;
}

View File

@ -47,6 +47,7 @@ public abstract class ColumnType<
public static ColumnType<VarintColumnReader, VarintColumnWriter> VARINT_LE = register("varintle", ByteOrder.LITTLE_ENDIAN, VarintColumn::open, VarintColumn::create);
public static ColumnType<VarintColumnReader, VarintColumnWriter> VARINT_BE = register("varintbe", ByteOrder.BIG_ENDIAN, VarintColumn::open, VarintColumn::create);
public static ColumnType<CustomBinaryColumnReader, CustomBinaryColumnWriter> BYTE_ARRAY_CUSTOM = register("s8[]+custom", ByteOrder.nativeOrder(), CustomBinaryColumn::open, CustomBinaryColumn::create);
public static ColumnType<GammaCodedSequenceReader, GammaCodedSequenceWriter> BYTE_ARRAY_GCS = register("s8[]+gcs", ByteOrder.nativeOrder(), GammaCodedSequenceColumn::open, GammaCodedSequenceColumn::create);
public static ColumnType<StringColumnReader, StringColumnWriter> STRING = register("s8[]+str", ByteOrder.nativeOrder(), StringColumn::open, StringColumn::create);
public static ColumnType<StringColumnReader, StringColumnWriter> CSTRING = register("s8+cstr", ByteOrder.nativeOrder(), StringColumn::open, StringColumn::create);
public static ColumnType<StringColumnReader, StringColumnWriter> TXTSTRING = register("s8+txt", ByteOrder.nativeOrder(), StringColumn::open, StringColumn::create);

View File

@ -13,7 +13,9 @@ public class TestUtil {
return;
if (Files.isDirectory(path)) {
for (File f : path.toFile().listFiles()) {
var contents = path.toFile().listFiles();
for (File f : contents) {
if (f.isDirectory()) {
File[] files = f.listFiles();
if (files != null) {

View File

@ -1,32 +0,0 @@
plugins {
id 'java'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation libs.bundles.slf4j
implementation project(':third-party:parquet-floor')
implementation project(':code:common:config')
implementation project(':code:common:db')
implementation project(':code:common:linkdb')
implementation libs.notnull
implementation libs.trove
implementation libs.bundles.parquet
implementation libs.bundles.mariadb
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}

View File

@ -1,16 +0,0 @@
# Crawl Spec
A crawl spec is a list of domains to be crawled. It is a parquet file with the following columns:
- `domain`: The domain to be crawled
- `crawlDepth`: The depth to which the domain should be crawled
- `urls`: A list of known URLs to be crawled
Crawl specs are used to define the scope of a crawl in the absence of known domains.
The [CrawlSpecRecord](java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java) class is
used to represent a record in the crawl spec.
The [CrawlSpecRecordParquetFileReader](java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java)
and [CrawlSpecRecordParquetFileWriter](java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java)
classes are used to read and write the crawl spec parquet files.

View File

@ -1,37 +0,0 @@
package nu.marginalia.io.processed;
import blue.strategic.parquet.HydratorSupplier;
import blue.strategic.parquet.ParquetReader;
import nu.marginalia.model.processed.DocumentRecord;
import nu.marginalia.model.processed.DocumentRecordKeywordsProjection;
import nu.marginalia.model.processed.DocumentRecordMetadataProjection;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.nio.file.Path;
import java.util.stream.Stream;
public class DocumentRecordParquetFileReader {
@NotNull
public static Stream<DocumentRecord> stream(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DocumentRecord.newHydrator()));
}
@NotNull
public static Stream<DocumentRecordKeywordsProjection> streamKeywordsProjection(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DocumentRecordKeywordsProjection.newHydrator()),
DocumentRecordKeywordsProjection.requiredColumns()
);
}
@NotNull
public static Stream<DocumentRecordMetadataProjection> streamMetadataProjection(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DocumentRecordMetadataProjection.newHydrator()),
DocumentRecordMetadataProjection.requiredColumns()
);
}
}

View File

@ -1,24 +0,0 @@
package nu.marginalia.io.processed;
import blue.strategic.parquet.ParquetWriter;
import nu.marginalia.model.processed.DocumentRecord;
import java.io.IOException;
import java.nio.file.Path;
public class DocumentRecordParquetFileWriter implements AutoCloseable {
private final ParquetWriter<DocumentRecord> writer;
public DocumentRecordParquetFileWriter(Path file) throws IOException {
writer = ParquetWriter.writeFile(DocumentRecord.schema,
file.toFile(), DocumentRecord.newDehydrator());
}
public void write(DocumentRecord documentRecord) throws IOException {
writer.write(documentRecord);
}
public void close() throws IOException {
writer.close();
}
}

View File

@ -1,30 +0,0 @@
package nu.marginalia.io.processed;
import blue.strategic.parquet.HydratorSupplier;
import blue.strategic.parquet.ParquetReader;
import nu.marginalia.model.processed.DomainLinkRecord;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.nio.file.Path;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class DomainLinkRecordParquetFileReader {
@NotNull
public static Stream<DomainLinkRecord> stream(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DomainLinkRecord.newHydrator()));
}
@NotNull
public static Set<String> getDestDomainNames(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DomainLinkRecord.newDestDomainHydrator()),
List.of("dest"))
.collect(Collectors.toSet());
}
}

View File

@ -1,24 +0,0 @@
package nu.marginalia.io.processed;
import blue.strategic.parquet.ParquetWriter;
import nu.marginalia.model.processed.DomainLinkRecord;
import java.io.IOException;
import java.nio.file.Path;
public class DomainLinkRecordParquetFileWriter implements AutoCloseable {
private final ParquetWriter<DomainLinkRecord> writer;
public DomainLinkRecordParquetFileWriter(Path file) throws IOException {
writer = ParquetWriter.writeFile(DomainLinkRecord.schema,
file.toFile(), DomainLinkRecord.newDehydrator());
}
public void write(DomainLinkRecord domainData) throws IOException {
writer.write(domainData);
}
public void close() throws IOException {
writer.close();
}
}

View File

@ -1,31 +0,0 @@
package nu.marginalia.io.processed;
import blue.strategic.parquet.HydratorSupplier;
import blue.strategic.parquet.ParquetReader;
import nu.marginalia.model.processed.DomainRecord;
import nu.marginalia.model.processed.DomainWithIp;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Stream;
public class DomainRecordParquetFileReader {
@NotNull
public static Stream<DomainRecord> stream(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DomainRecord.newHydrator()));
}
@NotNull
public static List<DomainWithIp> getBasicDomainInformation(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DomainRecord.newDomainNameHydrator()),
List.of("domain", "ip"))
.toList();
}
}

View File

@ -1,24 +0,0 @@
package nu.marginalia.io.processed;
import blue.strategic.parquet.ParquetWriter;
import nu.marginalia.model.processed.DomainRecord;
import java.io.IOException;
import java.nio.file.Path;
public class DomainRecordParquetFileWriter implements AutoCloseable {
private final ParquetWriter<DomainRecord> writer;
public DomainRecordParquetFileWriter(Path file) throws IOException {
writer = ParquetWriter.writeFile(DomainRecord.schema,
file.toFile(), DomainRecord.newDehydrator());
}
public void write(DomainRecord domainData) throws IOException {
writer.write(domainData);
}
public void close() throws IOException {
writer.close();
}
}

Some files were not shown because too many files have changed in this diff.