(wip) Extract and encode spans data

Refactoring keyword extraction to also extract span information.

Modifying the intermediate storage of converted data to use the new slop library, which allows for easier storage of ad-hoc binary data like spans and positions.

This is a bit of a Katamari Damacy commit that ended up dragging along a number of tangentially related changes that are hard to break out into separate commits after the fact. Pushing as-is to get back to more isolated work.
This commit is contained in:
Viktor Lofgren 2024-07-27 11:44:13 +02:00
parent 52a9a0d410
commit aebb2652e8
221 changed files with 2584 additions and 4613 deletions
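
For orientation, the span data introduced here flows roughly as follows: SpanRecorder (in DocumentKeywordExtractor) records the start/end word positions of regions inside tags such as TITLE, HEADING and CODE; DocumentKeywordsBuilder.build() interleaves those start/end pairs per tag and gamma-codes them; the result travels as a CodedWordSpan until ForwardIndexSpansWriter serializes it. The fragment below is a rough sketch of that encoding step, not code from the commit: the class and method names (CodedWordSpan, GammaCodedSequence.generate, HtmlTag and its code field) are taken from the changes further down, while the positions, buffer size and class name are invented for illustration.

import it.unimi.dsi.fastutil.ints.IntArrayList;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.idx.CodedWordSpan;
import nu.marginalia.sequence.GammaCodedSequence;
import java.nio.ByteBuffer;

// Hypothetical, illustrative only -- not part of the commit.
class SpanEncodingSketch {
    static CodedWordSpan encodeCodeSpans() {
        ByteBuffer workArea = ByteBuffer.allocate(1024);

        // Suppose words 5..9 and 20..24 sat inside <code> regions;
        // the builder flattens the (start, end) pairs into one
        // interleaved list per tag before gamma-coding them.
        IntArrayList positionsForTag = new IntArrayList();
        positionsForTag.add(5);  positionsForTag.add(9);
        positionsForTag.add(20); positionsForTag.add(24);

        // One record per tag: the tag's character code plus the coded position pairs.
        char tagCode = HtmlTag.CODE.code;
        return new CodedWordSpan((byte) tagCode,
                GammaCodedSequence.generate(workArea, positionsForTag));
    }
}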

View File

@ -14,6 +14,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies { dependencies {
implementation project(':code:libraries:braille-block-punch-cards') implementation project(':code:libraries:braille-block-punch-cards')
implementation project(':code:libraries:coded-sequence')
implementation libs.bundles.slf4j implementation libs.bundles.slf4j

View File

@ -0,0 +1,32 @@
package nu.marginalia.model.idx;
import nu.marginalia.sequence.CodedSequence;
import java.util.List;
public record CodedWordSpan(byte code, CodedSequence spans) {
public static SplitSpansList fromSplit(String codes, List<CodedSequence> spans) {
return new SplitSpansList(codes, spans);
}
public static SplitSpansList split(List<CodedWordSpan> spanList) {
return new SplitSpansList(
spanList.stream()
.map(CodedWordSpan::code)
.collect(StringBuilder::new, StringBuilder::append, StringBuilder::append).toString(),
spanList.stream()
.map(CodedWordSpan::spans)
.toList()
);
}
public record SplitSpansList(String codes, List<CodedSequence> spans) {
public List<CodedWordSpan> unite() {
if (null == codes) {
return List.of();
}
else {
return codes.chars().mapToObj(c -> new CodedWordSpan((byte) c, spans.get(codes.indexOf(c)))).toList();
}
}
}
}

View File

@ -38,19 +38,27 @@ public enum WordFlags {
ExternalLink ExternalLink
; ;
public int asBit() { public byte asBit() {
return 1 << ordinal(); return (byte) (1 << ordinal());
} }
public boolean isPresent(long value) { public boolean isPresent(byte value) {
return (asBit() & value) > 0; return (asBit() & value) > 0;
} }
public boolean isAbsent(long value) { public boolean isAbsent(byte value) {
return (asBit() & value) == 0; return (asBit() & value) == 0;
} }
public static EnumSet<WordFlags> decode(long encodedValue) { public static byte encode(EnumSet<WordFlags> flags) {
byte ret = 0;
for (WordFlags f : flags) {
ret |= f.asBit();
}
return ret;
}
public static EnumSet<WordFlags> decode(byte encodedValue) {
EnumSet<WordFlags> ret = EnumSet.noneOf(WordFlags.class); EnumSet<WordFlags> ret = EnumSet.noneOf(WordFlags.class);
for (WordFlags f : values()) { for (WordFlags f : values()) {

View File

@ -1,89 +0,0 @@
package nu.marginalia.model.idx;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import java.util.EnumSet;
import java.util.Set;
/** Word level metadata designed to fit in a single 64 bit long.
*
* @param positions bitmask of term positions within the document
* @param flags word flags (see {@link WordFlags})
*/
public record WordMetadata(long positions,
int flags) {
public static final long FLAGS_MASK = (1L << WordFlags.values().length) - 1;
public static final int POSITIONS_COUNT = 64 - WordFlags.values().length;
public static final int POSITIONS_SHIFT = WordFlags.values().length;
public static final long POSITIONS_MASK = ~0L >>> POSITIONS_SHIFT;
public WordMetadata() {
this(emptyValue());
}
public WordMetadata(long value) {
this(
((value >>> POSITIONS_SHIFT) & POSITIONS_MASK),
(int)(value & FLAGS_MASK)
);
}
public WordMetadata(long positions,
Set<WordFlags> flags)
{
this(positions, encodeFlags(flags));
}
private static int encodeFlags(Set<WordFlags> flags) {
int ret = 0;
for (var flag : flags) { ret |= flag.asBit(); }
return ret;
}
public static boolean hasFlags(long encoded, long metadataBitMask) {
return (encoded & metadataBitMask) == metadataBitMask;
}
public static boolean hasAnyFlags(long encoded, long metadataBitMask) {
return (encoded & metadataBitMask) != 0;
}
public static long decodePositions(long meta) {
return (meta >>> POSITIONS_SHIFT) & POSITIONS_MASK;
}
public boolean hasFlag(WordFlags flag) {
return (flags & flag.asBit()) != 0;
}
public String toString() {
return "[positions=%s; %s]".formatted(BrailleBlockPunchCards.printBits(positions, 56), flagSet());
}
/* Encoded in a 64 bit long
*/
public long encode() {
long ret = 0;
ret |= Integer.toUnsignedLong(flags) & FLAGS_MASK;
ret |= (positions & POSITIONS_MASK) << POSITIONS_SHIFT;
return ret;
}
public boolean isEmpty() {
return positions == 0 && flags == 0;
}
public static long emptyValue() {
return 0L;
}
public EnumSet<WordFlags> flagSet() {
return WordFlags.decode(flags);
}
}

View File

@ -1,41 +0,0 @@
package nu.marginalia.model;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import org.junit.jupiter.api.Test;
import java.util.EnumSet;
import static org.junit.jupiter.api.Assertions.assertEquals;
class WordMetadataTest {
@Test
public void codecTest() {
verifyCodec("Vanilla case", new WordMetadata(0x7f0f0000L, EnumSet.allOf(WordFlags.class)));
verifyCodec("Position 32bit", new WordMetadata(0xff0f0000L, EnumSet.allOf(WordFlags.class)));
verifyCodec("Position all", new WordMetadata(0xffff_ff0f_0000L, EnumSet.allOf(WordFlags.class)));
verifyCodec("No flags", new WordMetadata( 0xff0f0000L, EnumSet.noneOf(WordFlags.class)));
verifyCodec("No flags, some bits", new WordMetadata(0x3f_7f7f_7f7f_7f7fL, EnumSet.noneOf(WordFlags.class)));
verifyCodec("No flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.noneOf(WordFlags.class)));
verifyCodec("All flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.allOf(WordFlags.class)));
System.out.println(new WordMetadata(0x7f0f0005L, EnumSet.allOf(WordFlags.class)));
System.out.println(new WordMetadata(0xff0f0013L, EnumSet.noneOf(WordFlags.class)));
System.out.println(new WordMetadata(0xf0f000ff0f0013L, EnumSet.allOf(WordFlags.class)));
System.out.println(new WordMetadata(0xf0f000ff0f0013L, (byte)-1));
System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0));
System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0));
System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(~0L, (byte) 0).encode(), 64));
System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(0, (byte) 0xff).encode(), 64));
System.out.println(BrailleBlockPunchCards.printBits(131973L, 64));
System.out.println(new WordMetadata(131973L));
}
public void verifyCodec(String message, WordMetadata data) {
System.out.println(BrailleBlockPunchCards.printBits(data.encode(), 64));
assertEquals(data, new WordMetadata(data.encode()), message);
}
}

View File

@ -38,15 +38,15 @@ dependencies {
implementation project(':code:functions:search-query') implementation project(':code:functions:search-query')
implementation project(':code:execution:api') implementation project(':code:execution:api')
implementation project(':code:process-models:crawl-spec') implementation project(':code:processes:crawling-process:model')
implementation project(':code:process-models:crawling-model') implementation project(':code:processes:crawling-process:model')
implementation project(':code:features-crawl:link-parser') implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-convert:data-extractors') implementation project(':code:features-convert:data-extractors')
implementation project(':code:features-convert:stackexchange-xml') implementation project(':code:features-convert:stackexchange-xml')
implementation project(':code:features-convert:reddit-json') implementation project(':code:features-convert:reddit-json')
implementation project(':code:index:index-journal') implementation project(':code:index:index-journal')
implementation project(':code:index:api') implementation project(':code:index:api')
implementation project(':code:process-mqapi') implementation project(':code:processes:process-mq-api')
implementation project(':third-party:encyclopedia-marginalia-nu') implementation project(':third-party:encyclopedia-marginalia-nu')
implementation libs.bundles.slf4j implementation libs.bundles.slf4j

View File

@ -6,19 +6,11 @@ import com.google.inject.Singleton;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
import lombok.With; import lombok.With;
import nu.marginalia.IndexLocations;
import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorResumeBehavior; import nu.marginalia.actor.state.ActorResumeBehavior;
import nu.marginalia.actor.state.ActorStep; import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.actor.state.Resume; import nu.marginalia.actor.state.Resume;
import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.process.ProcessOutboxes;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.svc.BackupService;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.index.api.IndexMqClient; import nu.marginalia.index.api.IndexMqClient;
import nu.marginalia.index.api.IndexMqEndpoints; import nu.marginalia.index.api.IndexMqEndpoints;
import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.MqMessageState;
@ -27,9 +19,20 @@ import nu.marginalia.mqapi.converting.ConvertRequest;
import nu.marginalia.mqapi.index.CreateIndexRequest; import nu.marginalia.mqapi.index.CreateIndexRequest;
import nu.marginalia.mqapi.index.IndexName; import nu.marginalia.mqapi.index.IndexName;
import nu.marginalia.mqapi.loading.LoadRequest; import nu.marginalia.mqapi.loading.LoadRequest;
import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.process.ProcessOutboxes;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.svc.BackupService;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.nio.file.Files;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.List; import java.util.List;
@ -113,6 +116,21 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
yield new Load(List.of(processedId)); yield new Load(List.of(processedId));
} }
case Load(List<FileStorageId> processedIds, long msgId) when msgId < 0 -> { case Load(List<FileStorageId> processedIds, long msgId) when msgId < 0 -> {
// clear the output directory of the loader from any debris from partial jobs that have been aborted
Files.list(IndexLocations.getIndexConstructionArea(storageService)).forEach(path -> {
try {
if (Files.isDirectory(path)) {
FileUtils.deleteDirectory(path.toFile());
}
else if (Files.isRegularFile(path)) {
Files.delete(path);
}
} catch (Exception e) {
logger.error("Error clearing staging area", e);
}
});
long id = mqLoaderOutbox.sendAsync(new LoadRequest(processedIds)); long id = mqLoaderOutbox.sendAsync(new LoadRequest(processedIds));
yield new Load(processedIds, id); yield new Load(processedIds, id);

View File

@ -2,22 +2,25 @@ package nu.marginalia.svc;
import com.github.luben.zstd.ZstdInputStream; import com.github.luben.zstd.ZstdInputStream;
import com.github.luben.zstd.ZstdOutputStream; import com.github.luben.zstd.ZstdOutputStream;
import com.google.inject.Inject;
import nu.marginalia.IndexLocations; import nu.marginalia.IndexLocations;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.linkdb.LinkdbFileNames; import nu.marginalia.linkdb.LinkdbFileNames;
import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.index.journal.IndexJournalFileNames; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import com.google.inject.Inject; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.sql.SQLException; import java.sql.SQLException;
import java.time.LocalDateTime; import java.time.LocalDateTime;
import java.util.List; import java.util.List;
import java.util.Optional;
public class BackupService { public class BackupService {
@ -97,35 +100,20 @@ public class BackupService {
private void backupJournal(Path inputStorage, Path backupStorage) throws IOException private void backupJournal(Path inputStorage, Path backupStorage) throws IOException
{ {
for (var source : IndexJournalFileNames.findJournalFiles(inputStorage)) { Optional<IndexJournal> journal = IndexJournal.findJournal(inputStorage);
var dest = backupStorage.resolve(source.toFile().getName()); if (journal.isEmpty()) {
throw new FileNotFoundException("No journal found in input storage");
try (var is = Files.newInputStream(source);
var os = Files.newOutputStream(dest)
) {
IOUtils.copyLarge(is, os);
}
} }
FileUtils.copyDirectory(journal.get().journalDir().toFile(), backupStorage.resolve(journal.get().journalDir().getFileName()).toFile());
} }
private void restoreJournal(Path destStorage, Path backupStorage) throws IOException { private void restoreJournal(Path destStorage, Path backupStorage) throws IOException {
Optional<IndexJournal> journal = IndexJournal.findJournal(backupStorage);
// Remove any old journal files first to avoid them getting loaded if (journal.isEmpty()) {
for (var garbage : IndexJournalFileNames.findJournalFiles(destStorage)) { throw new FileNotFoundException("No journal found in backup");
Files.delete(garbage);
} }
FileUtils.copyDirectory(backupStorage.resolve(journal.get().journalDir().getFileName()).toFile(), destStorage.toFile());
for (var source : IndexJournalFileNames.findJournalFiles(backupStorage)) {
var dest = destStorage.resolve(source.toFile().getName());
try (var is = Files.newInputStream(source);
var os = Files.newOutputStream(dest)
) {
IOUtils.copyLarge(is, os);
}
}
} }
private void backupFileCompressed(String fileName, Path inputStorage, Path backupStorage) throws IOException private void backupFileCompressed(String fileName, Path inputStorage, Path backupStorage) throws IOException

View File

@ -24,7 +24,7 @@ dependencies {
implementation project(':code:libraries:blocking-thread-pool') implementation project(':code:libraries:blocking-thread-pool')
implementation project(':code:features-crawl:link-parser') implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-convert:anchor-keywords') implementation project(':code:features-convert:anchor-keywords')
implementation project(':code:process-models:crawling-model') implementation project(':code:processes:crawling-process:model')
implementation project(':code:processes:converting-process') implementation project(':code:processes:converting-process')
implementation project(':third-party:commons-codec') implementation project(':third-party:commons-codec')

View File

@ -3,13 +3,13 @@ package nu.marginalia.extractor;
import com.google.inject.Inject; import com.google.inject.Inject;
import gnu.trove.set.hash.TLongHashSet; import gnu.trove.set.hash.TLongHashSet;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
import nu.marginalia.link_parser.LinkParser; import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.process.log.WorkLog; import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage; import nu.marginalia.storage.model.FileStorage;

View File

@ -2,13 +2,13 @@ package nu.marginalia.extractor;
import com.google.inject.Inject; import com.google.inject.Inject;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.crawling.io.CrawledDomainReader; import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.link_parser.FeedExtractor; import nu.marginalia.link_parser.FeedExtractor;
import nu.marginalia.link_parser.LinkParser; import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.process.log.WorkLog; import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage; import nu.marginalia.storage.model.FileStorage;

View File

@ -5,11 +5,11 @@ import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TLongHashSet; import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.io.CrawledDomainReader; import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.language.filter.LanguageFilter; import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.process.log.WorkLog; import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage; import nu.marginalia.storage.model.FileStorage;

View File

@ -7,14 +7,16 @@ import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.Comparator; import java.util.Comparator;
import java.util.List;
import java.util.stream.Stream; import java.util.stream.Stream;
public class DocumentKeywordExtractor { public class DocumentKeywordExtractor {
private final KeywordExtractor keywordExtractor; private final KeywordExtractor keywordExtractor;
@ -93,7 +95,7 @@ public class DocumentKeywordExtractor {
var word = rep.word; var word = rep.word;
if (!word.isBlank()) { if (!word.isBlank()) {
long meta = metadata.getMetadataForWord(rep.stemmed); byte meta = metadata.getMetadataForWord(rep.stemmed);
wordsBuilder.addMeta(word, meta); wordsBuilder.addMeta(word, meta);
} }
} }
@ -105,7 +107,13 @@ public class DocumentKeywordExtractor {
{ {
// we use 1-based indexing since the data // we use 1-based indexing since the data
// will be gamma encoded, and it can't represent 0 // will be gamma encoded, and it can't represent 0
int pos = 1; int pos = 0;
List<SpanRecorder> spanRecorders = List.of(
new SpanRecorder(HtmlTag.TITLE),
new SpanRecorder(HtmlTag.HEADING),
new SpanRecorder(HtmlTag.CODE)
);
for (DocumentSentence sent : dld) { for (DocumentSentence sent : dld) {
@ -113,6 +121,12 @@ public class DocumentKeywordExtractor {
break; break;
for (var word : sent) { for (var word : sent) {
pos++;
for (var recorder : spanRecorders) {
recorder.update(sent, pos);
}
if (word.isStopWord()) { if (word.isStopWord()) {
continue; continue;
} }
@ -120,7 +134,7 @@ public class DocumentKeywordExtractor {
String w = word.wordLowerCase(); String w = word.wordLowerCase();
if (matchesWordPattern(w)) { if (matchesWordPattern(w)) {
/* Add information about term positions */ /* Add information about term positions */
wordsBuilder.addPos(w, pos++); wordsBuilder.addPos(w, pos);
/* Add metadata for word */ /* Add metadata for word */
wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed())); wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
@ -130,11 +144,16 @@ public class DocumentKeywordExtractor {
for (var names : keywordExtractor.getProperNames(sent)) { for (var names : keywordExtractor.getProperNames(sent)) {
var rep = new WordRep(sent, names); var rep = new WordRep(sent, names);
long meta = metadata.getMetadataForWord(rep.stemmed); byte meta = metadata.getMetadataForWord(rep.stemmed);
wordsBuilder.addMeta(rep.word, meta); wordsBuilder.addMeta(rep.word, meta);
} }
}
pos++; // we need to add one more position to account for the last word in the document
for (var recorder : spanRecorders) {
wordsBuilder.addSpans(recorder.finish(pos));
} }
} }
@ -176,4 +195,36 @@ public class DocumentKeywordExtractor {
return false; return false;
} }
/** Helper class to record spans of words */
private static class SpanRecorder {
private List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
private final HtmlTag htmlTag;
private int start = 0;
public SpanRecorder(HtmlTag htmlTag) {
this.htmlTag = htmlTag;
}
public void update(DocumentSentence sentence, int pos) {
assert pos > 0;
if (sentence.htmlTags.contains(htmlTag)) {
if (start <= 0) start = pos;
}
else {
if (start > 0) {
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
start = -1;
}
}
}
public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
if (start > 0) {
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
}
return spans;
}
}
} }

View File

@ -27,9 +27,9 @@ class KeywordMetadata {
this.urlKeywords = urlKeywords; this.urlKeywords = urlKeywords;
} }
public long getMetadataForWord(String stemmed) { public byte getMetadataForWord(String stemmed) {
long flags = 0; byte flags = 0;
if (subjectLikeKeywords.contains(stemmed)) { if (subjectLikeKeywords.contains(stemmed)) {
flags |= WordFlags.Subjects.asBit(); flags |= WordFlags.Subjects.asBit();

View File

@ -1,36 +1,36 @@
package nu.marginalia.keyword.model; package nu.marginalia.keyword.model;
import nu.marginalia.model.idx.CodedWordSpan;
import nu.marginalia.sequence.CodedSequence; import nu.marginalia.sequence.CodedSequence;
import java.io.Serial; import java.util.List;
import java.io.Serializable;
public final class DocumentKeywords implements Serializable { public final class DocumentKeywords {
@Serial public final List<String> keywords;
private static final long serialVersionUID = 1387282293082091432L; public final byte[] metadata;
public final List<CodedSequence> positions;
public final List<CodedWordSpan> spans;
public final String[] keywords; public DocumentKeywords(List<String> keywords,
public final long[] metadata; byte[] metadata,
public final CodedSequence[] positions; List<CodedSequence> positions,
List<CodedWordSpan> spans)
public DocumentKeywords(String[] keywords,
long[] metadata,
CodedSequence[] positions)
{ {
this.keywords = keywords; this.keywords = keywords;
this.metadata = metadata; this.metadata = metadata;
this.positions = positions; this.positions = positions;
this.spans = spans;
assert keywords.length == metadata.length; assert keywords.size() == metadata.length;
} }
public boolean isEmpty() { public boolean isEmpty() {
return keywords.length == 0; return keywords.isEmpty();
} }
public int size() { public int size() {
return keywords.length; return keywords.size();
} }
} }

View File

@ -1,11 +1,13 @@
package nu.marginalia.keyword.model; package nu.marginalia.keyword.model;
import gnu.trove.list.array.TByteArrayList;
import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList; import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap; import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap;
import lombok.Getter; import lombok.Getter;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.idx.CodedWordSpan;
import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.sequence.CodedSequence; import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.GammaCodedSequence;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -16,8 +18,9 @@ import java.util.*;
@Getter @Getter
public class DocumentKeywordsBuilder { public class DocumentKeywordsBuilder {
public final Object2LongLinkedOpenHashMap<String> wordToMeta; public final Object2ByteOpenHashMap<String> wordToMeta;
public final HashMap<String, IntList> wordToPos; public final HashMap<String, IntList> wordToPos;
public final Map<Character, List<DocumentWordSpan>> wordSpans = new HashMap<>();
/** These ware keywords that had signals of high relevance */ /** These ware keywords that had signals of high relevance */
public final Set<String> importantWords = new HashSet<>(); public final Set<String> importantWords = new HashSet<>();
@ -35,17 +38,17 @@ public class DocumentKeywordsBuilder {
} }
public DocumentKeywords build(ByteBuffer workArea) { public DocumentKeywords build(ByteBuffer workArea) {
final String[] wordArray = new String[wordToMeta.size()]; final List<String> wordArray = new ArrayList<>(wordToMeta.size());
final long[] meta = new long[wordToMeta.size()]; final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
final CodedSequence[] positions = new CodedSequence[wordToMeta.size()]; final List<CodedSequence> positions = new ArrayList<>(wordToMeta.size());
var iter = wordToMeta.object2LongEntrySet().fastIterator(); var iter = wordToMeta.object2ByteEntrySet().fastIterator();
for (int i = 0; iter.hasNext(); i++) { while (iter.hasNext()) {
var entry = iter.next(); var entry = iter.next();
meta[i] = entry.getLongValue(); meta.add(entry.getByteValue());
wordArray[i] = entry.getKey(); wordArray.add(entry.getKey());
var posList = wordToPos.getOrDefault(entry.getKey(), IntList.of()); var posList = wordToPos.getOrDefault(entry.getKey(), IntList.of());
@ -53,18 +56,33 @@ public class DocumentKeywordsBuilder {
posList.subList(MAX_POSITIONS_PER_WORD, posList.size()).clear(); posList.subList(MAX_POSITIONS_PER_WORD, posList.size()).clear();
} }
positions[i] = GammaCodedSequence.generate(workArea, posList); positions.add(GammaCodedSequence.generate(workArea, posList));
} }
return new DocumentKeywords(wordArray, meta, positions); // Encode spans
List<CodedWordSpan> spans = new ArrayList<>(wordSpans.size());
wordSpans.forEach((tag, spansForTag) -> {
spansForTag.sort(Comparator.comparingInt(DocumentWordSpan::start));
var positionsForTag = new IntArrayList(spansForTag.size()*2);
for (var span : spansForTag) {
positionsForTag.add(span.start());
positionsForTag.add(span.end());
}
spans.add(new CodedWordSpan((byte) tag.charValue(), GammaCodedSequence.generate(workArea, positionsForTag)));
});
return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);
} }
public DocumentKeywordsBuilder(int capacity) { public DocumentKeywordsBuilder(int capacity) {
wordToMeta = new Object2LongLinkedOpenHashMap<>(capacity); wordToMeta = new Object2ByteOpenHashMap<>(capacity);
wordToPos = new HashMap<>(capacity); wordToPos = new HashMap<>(capacity);
} }
public void addMeta(String word, long meta) { public void addMeta(String word, byte meta) {
if (word.length() > MAX_WORD_LENGTH) if (word.length() > MAX_WORD_LENGTH)
return; return;
@ -84,12 +102,12 @@ public class DocumentKeywordsBuilder {
public void setFlagOnMetadataForWords(WordFlags flag, Collection<String> flagWords) { public void setFlagOnMetadataForWords(WordFlags flag, Collection<String> flagWords) {
flagWords.forEach(word -> flagWords.forEach(word ->
wordToMeta.mergeLong(word, flag.asBit(), (a, b) -> a|b) wordToMeta.mergeByte(word, flag.asBit(), (a, b) -> (byte)(a|b))
); );
} }
public void addAllSyntheticTerms(Collection<String> newWords) { public void addAllSyntheticTerms(Collection<String> newWords) {
long meta = WordFlags.Synthetic.asBit(); byte meta = WordFlags.Synthetic.asBit();
// Only add the synthetic flag if the words aren't already present // Only add the synthetic flag if the words aren't already present
@ -97,17 +115,17 @@ public class DocumentKeywordsBuilder {
} }
public void addAnchorTerms(Map<String, Integer> keywords) { public void addAnchorTerms(Map<String, Integer> keywords) {
long flagA = WordFlags.ExternalLink.asBit(); byte flagA = WordFlags.ExternalLink.asBit();
long flagB = flagA | WordFlags.Site.asBit(); byte flagB = (byte) (flagA | WordFlags.Site.asBit());
long flagC = flagB | WordFlags.SiteAdjacent.asBit(); byte flagC = (byte) (flagB | WordFlags.SiteAdjacent.asBit());
keywords.forEach((word, count) -> { keywords.forEach((word, count) -> {
if (count > 5) { if (count > 5) {
wordToMeta.mergeLong(word, flagC, (a, b) -> a|b); wordToMeta.mergeByte(word, flagC, (a, b) -> (byte) (a|b));
} else if (count > 2) { } else if (count > 2) {
wordToMeta.mergeLong(word, flagB, (a, b) -> a|b); wordToMeta.mergeByte(word, flagB, (a, b) -> (byte) (a|b));
} else { } else {
wordToMeta.mergeLong(word, flagA, (a, b) -> a|b); wordToMeta.mergeByte(word, flagA, (a, b) -> (byte) (a|b));
} }
}); });
} }
@ -115,9 +133,9 @@ public class DocumentKeywordsBuilder {
public List<String> getWordsWithAnyFlag(long flags) { public List<String> getWordsWithAnyFlag(long flags) {
List<String> ret = new ArrayList<>(); List<String> ret = new ArrayList<>();
for (var iter = wordToMeta.object2LongEntrySet().fastIterator(); iter.hasNext();) { for (var iter = wordToMeta.object2ByteEntrySet().fastIterator(); iter.hasNext();) {
var entry = iter.next(); var entry = iter.next();
if ((flags & entry.getLongValue()) != 0) { if ((flags & entry.getByteValue()) != 0) {
ret.add(entry.getKey()); ret.add(entry.getKey());
} }
} }
@ -125,21 +143,27 @@ public class DocumentKeywordsBuilder {
return ret; return ret;
} }
public void addSpans(List<DocumentWordSpan> newSpans) {
for (var span : newSpans) {
wordSpans.computeIfAbsent(span.tag().code, k -> new ArrayList<>()).add(span);
}
}
public int size() { public int size() {
return Math.max(wordToMeta.size(), wordToPos.size()); return Math.max(wordToMeta.size(), wordToPos.size());
} }
public WordMetadata getMetaForWord(String word) {
return new WordMetadata(wordToMeta.getLong(word));
}
@Override @Override
public String toString() { public String toString() {
StringBuilder sb = new StringBuilder("[ "); StringBuilder sb = new StringBuilder("[ ");
wordToMeta.forEach((word, meta) -> { wordToMeta.forEach((word, meta) -> {
sb.append(word).append("->").append(new WordMetadata(meta).flagSet()).append(',').append(wordToPos.getOrDefault(word, new IntArrayList())).append(' '); sb.append(word).append("->").append(WordFlags.decode(meta)).append(',').append(wordToPos.getOrDefault(word, new IntArrayList())).append(' ');
}); });
return sb.append(']').toString(); return sb.append(']').toString();
} }
public record DocumentWordSpan(HtmlTag tag, int start, int end) {
}
} }

View File

@ -4,9 +4,8 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.sequence.CodedSequence; import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
@ -53,30 +52,11 @@ class DocumentKeywordExtractorTest {
keywords.getWordToMeta().forEach((k, v) -> { keywords.getWordToMeta().forEach((k, v) -> {
if (k.contains("_")) { if (k.contains("_")) {
System.out.println(k + " " + new WordMetadata(v)); System.out.println(k + " " + WordFlags.decode(v));
} }
}); });
} }
@Test
public void testKeyboards() throws IOException, URISyntaxException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0.5));
var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
System.out.println(keywords.getMetaForWord("mechanical"));
System.out.println(keywords.getMetaForWord("keyboard"));
System.out.println(keywords.getMetaForWord("keyboards"));
System.out.println(new WordMetadata(8894889328781L));
System.out.println(new WordMetadata(4294967297L));
System.out.println(new WordMetadata(566820053975498886L));
// -
System.out.println(new WordMetadata(1198298103937L));
System.out.println(new WordMetadata(1103808168065L));
}
@Test @Test
public void testMadonna() throws IOException, URISyntaxException { public void testMadonna() throws IOException, URISyntaxException {
@ -93,16 +73,17 @@ class DocumentKeywordExtractorTest {
var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024)); var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024));
Map<String, WordMetadata> flags = new HashMap<>(); Map<String, Byte> flags = new HashMap<>();
Map<String, CodedSequence> positions = new HashMap<>(); Map<String, CodedSequence> positions = new HashMap<>();
for (int i = 0; i < keywordsBuilt.size(); i++) { for (int i = 0; i < keywordsBuilt.size(); i++) {
String keyword = keywordsBuilt.keywords[i]; String keyword = keywordsBuilt.keywords.get(i);
long metadata = keywordsBuilt.metadata[i]; byte metadata = keywordsBuilt.metadata[i]
;
if (Set.of("dirty", "blues").contains(keyword)) { if (Set.of("dirty", "blues").contains(keyword)) {
flags.put(keyword, new WordMetadata(metadata)); flags.put(keyword, metadata);
positions.put(keyword, keywordsBuilt.positions[i]); positions.put(keyword, keywordsBuilt.positions.get(i));
} }
} }
@ -127,7 +108,5 @@ class DocumentKeywordExtractorTest {
new TermFrequencyDict(WmsaHome.getLanguageModels())); new TermFrequencyDict(WmsaHome.getLanguageModels()));
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online"));
System.out.println(keywords.getMetaForWord("knitting"));
} }
} }

View File

@ -1,6 +1,9 @@
package nu.marginalia.api.searchquery; package nu.marginalia.api.searchquery;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
@ -11,9 +14,6 @@ import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs; import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs;
import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import java.util.ArrayList; import java.util.ArrayList;
@ -197,7 +197,8 @@ public class QueryProtobufCodec {
return new SearchResultKeywordScore( return new SearchResultKeywordScore(
keywordScores.getKeyword(), keywordScores.getKeyword(),
-1, // termId is internal to index service -1, // termId is internal to index service
keywordScores.getEncodedWordMetadata() (byte) keywordScores.getFlags(),
keywordScores.getPositions()
); );
} }

View File

@ -1,40 +1,32 @@
package nu.marginalia.api.searchquery.model.results; package nu.marginalia.api.searchquery.model.results;
import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import java.util.Objects; import java.util.Objects;
public final class SearchResultKeywordScore { public final class SearchResultKeywordScore {
public final long termId; public final long termId;
public final String keyword; public final String keyword;
private final long encodedWordMetadata; public byte flags;
public int positionCount;
public SearchResultKeywordScore(String keyword, public SearchResultKeywordScore(String keyword,
long termId, long termId,
long encodedWordMetadata) { byte flags,
int positionCount) {
this.termId = termId; this.termId = termId;
this.keyword = keyword; this.keyword = keyword;
this.encodedWordMetadata = encodedWordMetadata;
} }
public boolean hasTermFlag(WordFlags flag) { public boolean hasTermFlag(WordFlags flag) {
return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit()); return (flags & flag.asBit()) != 0;
} }
public long positions() {
return WordMetadata.decodePositions(encodedWordMetadata);
}
public boolean isKeywordSpecial() { public boolean isKeywordSpecial() {
return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic); return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic);
} }
public long encodedWordMetadata() {
return encodedWordMetadata;
}
@Override @Override
public boolean equals(Object obj) { public boolean equals(Object obj) {
if (obj == this) return true; if (obj == this) return true;
@ -51,8 +43,7 @@ public final class SearchResultKeywordScore {
@Override @Override
public String toString() { public String toString() {
return "SearchResultKeywordScore[" + return "SearchResultKeywordScore[" +
"keyword=" + keyword + ", " + "keyword=" + keyword + ']';
"encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ']';
} }
} }

View File

@ -108,7 +108,8 @@ message RpcRawResultItem {
/* Information about how well a keyword matches a query */ /* Information about how well a keyword matches a query */
message RpcResultKeywordScore { message RpcResultKeywordScore {
string keyword = 1; // the keyword string keyword = 1; // the keyword
int64 encodedWordMetadata = 2; // bit encoded word metadata int32 flags = 2;
int32 positions = 3;
} }
/* Query execution parameters */ /* Query execution parameters */

View File

@ -30,8 +30,9 @@ dependencies {
implementation project(':code:common:linkdb') implementation project(':code:common:linkdb')
implementation project(':code:common:service') implementation project(':code:common:service')
implementation project(':code:functions:search-query:api') implementation project(':code:processes:converting-process:model')
implementation project(':code:functions:search-query:api')
implementation project(':code:index:index-forward') implementation project(':code:index:index-forward')
implementation project(':code:index:index-reverse') implementation project(':code:index:index-reverse')
implementation project(':code:index:query') implementation project(':code:index:query')
@ -73,4 +74,5 @@ dependencies {
testImplementation project(':code:libraries:test-helpers') testImplementation project(':code:libraries:test-helpers')
testImplementation project(':code:libraries:term-frequency-dict') testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:libraries:braille-block-punch-cards') testImplementation project(':code:libraries:braille-block-punch-cards')
testImplementation project(':code:libraries:test-helpers')
} }

View File

@ -15,11 +15,13 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies { dependencies {
implementation project(':code:libraries:array') implementation project(':code:libraries:array')
implementation project(':code:libraries:btree') implementation project(':code:libraries:btree')
implementation project(':code:libraries:slop')
implementation project(':code:libraries:coded-sequence') implementation project(':code:libraries:coded-sequence')
implementation project(':code:index:query') implementation project(':code:index:query')
implementation project(':code:index:index-journal') implementation project(':code:index:index-journal')
implementation project(':code:common:model') implementation project(':code:common:model')
implementation project(':code:common:process') implementation project(':code:common:process')
implementation project(':code:processes:converting-process:model')
implementation libs.bundles.slf4j implementation libs.bundles.slf4j
@ -28,6 +30,7 @@ dependencies {
implementation libs.fastutil implementation libs.fastutil
implementation libs.trove implementation libs.trove
testImplementation project(':code:libraries:test-helpers')
testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit testImplementation libs.bundles.junit
testImplementation libs.mockito testImplementation libs.mockito

View File

@ -1,19 +1,21 @@
package nu.marginalia.index.forward; package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.array.LongArray;
import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.slop.column.primitive.LongColumnReader;
import org.roaringbitmap.longlong.LongConsumer; import org.roaringbitmap.longlong.LongConsumer;
import org.roaringbitmap.longlong.Roaring64Bitmap; import org.roaringbitmap.longlong.Roaring64Bitmap;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.IOException; import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
@ -23,22 +25,25 @@ public class ForwardIndexConverter {
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
private final IndexJournalReader journalReader;
private final Path outputFileDocsId; private final Path outputFileDocsId;
private final Path outputFileDocsData; private final Path outputFileDocsData;
private final DomainRankings domainRankings; private final DomainRankings domainRankings;
private final Path outputFileSpansData;
private final IndexJournal journal;
public ForwardIndexConverter(ProcessHeartbeat heartbeat, public ForwardIndexConverter(ProcessHeartbeat heartbeat,
IndexJournalReader journalReader,
Path outputFileDocsId, Path outputFileDocsId,
Path outputFileDocsData, Path outputFileDocsData,
Path outputFileSpansData,
IndexJournal journal,
DomainRankings domainRankings DomainRankings domainRankings
) { ) {
this.heartbeat = heartbeat; this.heartbeat = heartbeat;
this.journalReader = journalReader;
this.outputFileDocsId = outputFileDocsId; this.outputFileDocsId = outputFileDocsId;
this.outputFileDocsData = outputFileDocsData; this.outputFileDocsData = outputFileDocsData;
this.outputFileSpansData = outputFileSpansData;
this.journal = journal;
this.domainRankings = domainRankings; this.domainRankings = domainRankings;
} }
@ -58,7 +63,7 @@ public class ForwardIndexConverter {
try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) { try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) {
progress.progress(TaskSteps.GET_DOC_IDS); progress.progress(TaskSteps.GET_DOC_IDS);
LongArray docsFileId = getDocIds(outputFileDocsId, journalReader); LongArray docsFileId = getDocIds(outputFileDocsId, journal);
progress.progress(TaskSteps.GATHER_OFFSETS); progress.progress(TaskSteps.GATHER_OFFSETS);
@ -73,20 +78,55 @@ public class ForwardIndexConverter {
LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size()); LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
var pointer = journalReader.newPointer(); ByteBuffer workArea = ByteBuffer.allocate(65536);
while (pointer.nextDocument()) { for (var instance : journal.pages()) {
long docId = pointer.documentId(); try (var docIdReader = instance.openCombinedId();
int domainId = UrlIdCodec.getDomainId(docId); var metaReader = instance.openDocumentMeta();
var featuresReader = instance.openFeatures();
var sizeReader = instance.openSize();
long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(docId); var spansCodesReader = instance.openSpanCodes();
var spansSeqReader = instance.openSpans();
var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData)
)
{
while (docIdReader.hasRemaining()) {
long docId = docIdReader.get();
int domainId = UrlIdCodec.getDomainId(docId);
int ranking = domainRankings.getRanking(domainId); long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(docId);
long meta = DocumentMetadata.encodeRank(pointer.documentMeta(), ranking);
long features = pointer.documentFeatures() | ((long) pointer.documentSize() << 32L); int ranking = domainRankings.getRanking(domainId);
long meta = DocumentMetadata.encodeRank(metaReader.get(), ranking);
docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta); final int docFeatures = featuresReader.get();
docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, features); final int docSize = sizeReader.get();
long features = docFeatures | ((long) docSize << 32L);
// Write spans data
byte[] spansCodes = spansCodesReader.get();
spansWriter.beginRecord(spansCodes.length);
for (int i = 0; i < spansCodes.length; i++) {
workArea.clear();
spansSeqReader.getData(workArea);
workArea.flip();
spansWriter.writeSpan(spansCodes[i], workArea);
}
long encodedSpansOffset = spansWriter.endRecord();
// Write the principal forward documents file
docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, features);
docFileData.set(entryOffset + ForwardIndexParameters.SPANS_OFFSET, encodedSpansOffset);
}
}
} }
progress.progress(TaskSteps.FORCE); progress.progress(TaskSteps.FORCE);
@ -104,9 +144,16 @@ public class ForwardIndexConverter {
} }
} }
private LongArray getDocIds(Path outputFileDocs, IndexJournalReader journalReader) throws IOException { private LongArray getDocIds(Path outputFileDocs, IndexJournal journalReader) throws IOException {
Roaring64Bitmap rbm = new Roaring64Bitmap(); Roaring64Bitmap rbm = new Roaring64Bitmap();
journalReader.forEachDocId(rbm::add);
for (var instance : journalReader.pages()) {
try (LongColumnReader idReader = instance.openCombinedId()) {
while (idReader.hasRemaining()) {
rbm.add(idReader.get());
}
}
}
LongArray ret = LongArrayFactory.mmapForWritingConfined(outputFileDocs, rbm.getIntCardinality()); LongArray ret = LongArrayFactory.mmapForWritingConfined(outputFileDocs, rbm.getIntCardinality());
rbm.forEach(new LongConsumer() { rbm.forEach(new LongConsumer() {

View File

@ -13,6 +13,10 @@ public class ForwardIndexFileNames {
case NEXT -> basePath.resolve("fwd-doc-data.dat.next"); case NEXT -> basePath.resolve("fwd-doc-data.dat.next");
case CURRENT -> basePath.resolve("fwd-doc-data.dat"); case CURRENT -> basePath.resolve("fwd-doc-data.dat");
}; };
case SPANS_DATA -> switch (version) {
case NEXT -> basePath.resolve("fwd-spans.dat.next");
case CURRENT -> basePath.resolve("fwd-spans.dat");
};
}; };
} }
@ -23,6 +27,7 @@ public class ForwardIndexFileNames {
public enum FileIdentifier { public enum FileIdentifier {
DOC_DATA, DOC_DATA,
SPANS_DATA,
DOC_ID DOC_ID
} }
} }

View File

@ -1,8 +1,8 @@
package nu.marginalia.index.forward; package nu.marginalia.index.forward;
class ForwardIndexParameters { class ForwardIndexParameters {
public static final int ENTRY_SIZE = 2; public static final int ENTRY_SIZE = 3;
public static final int METADATA_OFFSET = 0; public static final int METADATA_OFFSET = 0;
public static final int FEATURES_OFFSET = 1; public static final int FEATURES_OFFSET = 1;
public static final int SPANS_OFFSET = 2;
} }

View File

@ -29,19 +29,31 @@ public class ForwardIndexReader {
private final TLongIntHashMap idToOffset; private final TLongIntHashMap idToOffset;
private final LongArray data; private final LongArray data;
private final ForwardIndexSpansReader spansReader;
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
public ForwardIndexReader(Path idsFile, Path dataFile) throws IOException { public ForwardIndexReader(Path idsFile,
Path dataFile,
Path spansFile) throws IOException {
if (!Files.exists(dataFile)) { if (!Files.exists(dataFile)) {
logger.warn("Failed to create ForwardIndexReader, {} is absent", dataFile); logger.warn("Failed to create ForwardIndexReader, {} is absent", dataFile);
idToOffset = null; idToOffset = null;
data = null; data = null;
spansReader = null;
return; return;
} }
else if (!Files.exists(idsFile)) { else if (!Files.exists(idsFile)) {
logger.warn("Failed to create ForwardIndexReader, {} is absent", idsFile); logger.warn("Failed to create ForwardIndexReader, {} is absent", idsFile);
idToOffset = null; idToOffset = null;
data = null; data = null;
spansReader = null;
return;
}
else if (!Files.exists(spansFile)) {
logger.warn("Failed to create ForwardIndexReader, {} is absent", spansFile);
idToOffset = null;
data = null;
spansReader = null;
return; return;
} }
@ -49,6 +61,7 @@ public class ForwardIndexReader {
idToOffset = loadIds(idsFile); idToOffset = loadIds(idsFile);
data = loadData(dataFile); data = loadData(dataFile);
spansReader = new ForwardIndexSpansReader(spansFile);
} }
private static TLongIntHashMap loadIds(Path idsFile) throws IOException { private static TLongIntHashMap loadIds(Path idsFile) throws IOException {

View File

@ -0,0 +1,63 @@
package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.GammaCodedSequence;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.List;
@SuppressWarnings("preview")
public class ForwardIndexSpansReader implements AutoCloseable {
private final FileChannel spansFileChannel;
public ForwardIndexSpansReader(Path spansFile) throws IOException {
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
}
public List<SpanData> readSpans(Arena arena, long encodedOffset) throws IOException {
long size = encodedOffset & 0xFFF_FFFF;
long offset = encodedOffset >>> 28;
var buffer = arena.allocate(size).asByteBuffer();
buffer.clear();
while (buffer.hasRemaining()) {
spansFileChannel.read(buffer, offset + buffer.position());
}
buffer.flip();
int count = buffer.get();
List<SpanData> ret = new ArrayList<>();
while (count-- > 0) {
byte code = buffer.get();
short len = buffer.getShort();
final int pos = buffer.position();
// Decode the gamma-coded sequence; this will advance the buffer position
// in a not entirely predictable way, so we need to save the position
buffer.limit(buffer.position() + len);
var sequence = new GammaCodedSequence(buffer).values();
ret.add(new SpanData(code, sequence));
// Reset the buffer position to the end of the span
buffer.position(pos + len);
buffer.limit(buffer.capacity());
}
return ret;
}
@Override
public void close() throws IOException {
spansFileChannel.close();
}
public record SpanData(byte code, IntList data) {}
}

View File

@ -0,0 +1,53 @@
package nu.marginalia.index.forward;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class ForwardIndexSpansWriter implements AutoCloseable {
private final FileChannel outputChannel;
private final ByteBuffer work = ByteBuffer.allocate(32);
private long stateStartOffset = -1;
private int stateLength = -1;
public ForwardIndexSpansWriter(Path outputFileSpansData) throws IOException {
this.outputChannel = (FileChannel) Files.newByteChannel(outputFileSpansData, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE);
}
public void beginRecord(int count) throws IOException {
stateStartOffset = outputChannel.position();
stateLength = 0;
work.clear();
work.put((byte) count);
work.flip();
while (work.hasRemaining())
stateLength += outputChannel.write(work);
}
public void writeSpan(byte spanCode, ByteBuffer sequenceData) throws IOException {
work.clear();
work.put(spanCode);
work.putShort((short) sequenceData.remaining());
work.flip();
while (work.hasRemaining() || sequenceData.hasRemaining()) {
stateLength += (int) outputChannel.write(new ByteBuffer[]{work, sequenceData});
}
}
public long endRecord() {
return stateStartOffset << 28 | stateLength;
}
@Override
public void close() throws IOException {
outputChannel.close();
}
}
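
As a quick illustration of the offset/length packing that endRecord() returns and readSpans() above unpacks (the numbers below are invented, not from the commit):

long recordOffset = 4096;      // byte position where beginRecord() started writing
long recordLength = 37;        // bytes written for the whole record

long encoded = recordOffset << 28 | recordLength;  // what endRecord() returns

long length = encoded & 0xFFF_FFFF;   // lower 28 bits -> 37
long offset = encoded >>> 28;         // upper bits    -> 4096

Since the length occupies the low 28 bits, this appears to cap a single record at roughly 256 MB of span data.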

View File

@ -2,15 +2,11 @@ package nu.marginalia.index.forward;
import lombok.SneakyThrows;
import nu.marginalia.index.domainrankings.DomainRankings;
- import nu.marginalia.index.journal.model.IndexJournalEntryData;
- import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
- import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
- import nu.marginalia.index.journal.writer.IndexJournalWriter;
- import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
+ import nu.marginalia.index.journal.IndexJournal;
+ import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.model.id.UrlIdCodec;
+ import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.process.control.FakeProcessHeartbeat;
- import nu.marginalia.sequence.CodedSequence;
- import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.test.TestUtil;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
@ -21,85 +17,94 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
- import java.util.stream.IntStream;
+ import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
class ForwardIndexConverterTest {
- IndexJournalWriter writer;
- Path indexFile;
+ IndexJournalSlopWriter writer;
Path wordsFile1;
Path urlsFile1;
Path dictionaryFile;
+ Path workDir;
private final Logger logger = LoggerFactory.getLogger(getClass());
Path dataDir;
private Path docsFileId;
private Path docsFileData;
+ private Path docsSpanData;
int workSetSize = 512;
@BeforeEach
@SneakyThrows
void setUp() {
+ workDir = Files.createTempDirectory(getClass().getSimpleName());
dictionaryFile = Files.createTempFile("tmp", ".dict");
dictionaryFile.toFile().deleteOnExit();
- indexFile = Files.createTempFile("tmp", ".idx");
- indexFile.toFile().deleteOnExit();
- writer = new IndexJournalWriterSingleFileImpl(indexFile);
wordsFile1 = Files.createTempFile("words1", ".idx");
urlsFile1 = Files.createTempFile("urls1", ".idx");
dataDir = Files.createTempDirectory(getClass().getSimpleName());
- for (int i = 1; i < workSetSize; i++) {
- createEntry(writer, i);
+ try (var writer = new IndexJournalSlopWriter(IndexJournal.allocateName(workDir), 0)) {
+ for (int i = 1; i < workSetSize; i++) {
+ createEntry(writer, i);
+ }
}
- writer.close();
docsFileId = dataDir.resolve("docs-i.dat");
docsFileData = dataDir.resolve("docs-d.dat");
+ docsSpanData = dataDir.resolve("docs-s.dat");
}
@AfterEach
public void tearDown() {
TestUtil.clearTempDir(dataDir);
+ TestUtil.clearTempDir(workDir);
}
long createId(long url, long domain) {
return UrlIdCodec.encodeId((int) domain, (int) url);
}
- public void createEntry(IndexJournalWriter writer, int id) {
+ public void createEntry(IndexJournalSlopWriter writer, int id) {
writer.put(
- new IndexJournalEntryHeader(createId(id, id/20),
+ createId(id, id/20),
+ new SlopDocumentRecord.KeywordsProjection(
+ "",
+ -1,
id%3,
+ id%5,
15,
- (id % 5)),
- new IndexJournalEntryData(
- new String[]{},
- new long[]{},
- new CodedSequence[]{}
+ List.of(),
+ new byte[0],
+ List.of(),
+ new byte[0],
+ List.of()
)
);
}
@Test
void testForwardIndex() throws IOException {
new ForwardIndexConverter(new FakeProcessHeartbeat(),
- new IndexJournalReaderSingleFile(indexFile),
docsFileId,
docsFileData,
+ docsSpanData,
+ IndexJournal.findJournal(workDir).orElseThrow(),
new DomainRankings()).convert();
- var forwardReader = new ForwardIndexReader(docsFileId, docsFileData);
+ var forwardReader = new ForwardIndexReader(docsFileId, docsFileData, docsSpanData);
for (int i = 36; i < workSetSize; i++) {
long docId = createId(i, i/20);
@ -108,5 +113,4 @@ class ForwardIndexConverterTest {
assertEquals(i/20, UrlIdCodec.getDomainId(docId));
}
}
}

View File

@ -0,0 +1,63 @@
package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.GammaCodedSequence;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.assertEquals;
class ForwardIndexSpansReaderTest {
Path testFile = Files.createTempFile("test", ".idx");
ForwardIndexSpansReaderTest() throws IOException {
}
@AfterEach
public void tearDown() throws IOException {
Files.deleteIfExists(testFile);
}
@Test
void testSunnyDay() throws IOException {
ByteBuffer wa = ByteBuffer.allocate(32);
long offset1;
long offset2;
try (var writer = new ForwardIndexSpansWriter(testFile)) {
writer.beginRecord(1);
writer.writeSpan((byte) 'a', GammaCodedSequence.generate(wa, 1, 3, 5).buffer());
offset1 = writer.endRecord();
writer.beginRecord(2);
writer.writeSpan((byte) 'b', GammaCodedSequence.generate(wa, 2, 4, 6).buffer());
writer.writeSpan((byte) 'c', GammaCodedSequence.generate(wa, 3, 5, 7).buffer());
offset2 = writer.endRecord();
}
try (var reader = new ForwardIndexSpansReader(testFile);
var arena = Arena.ofConfined()
) {
var spans1 = reader.readSpans(arena, offset1);
var spans2 = reader.readSpans(arena, offset2);
assertEquals(1, spans1.size());
assertEquals('a', spans1.get(0).code());
assertEquals(IntList.of(1, 3, 5), spans1.get(0).data());
assertEquals(2, spans2.size());
assertEquals('b', spans2.get(0).code());
assertEquals(IntList.of(2, 4, 6), spans2.get(0).data());
assertEquals('c', spans2.get(1).code());
assertEquals(IntList.of(3, 5, 7), spans2.get(1).data());
}
}
}

View File

@ -1,43 +0,0 @@
package nu.marginalia.test;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
public class TestUtil {
public static void clearTempDir(Path dir) {
if (Files.isDirectory(dir)) {
for (File f : dir.toFile().listFiles()) {
File[] files = f.listFiles();
if (files != null) {
Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir);
}
System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")");
f.delete();
}
}
System.out.println("Deleting " + dir);
dir.toFile().delete();
}
private static String fileSize(Path path) {
try {
long sizeBytes = Files.size(path);
if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb";
if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb";
if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb";
return sizeBytes + "b";
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
}
private static String round(double d) {
return String.format("%.2f", d);
}
}

View File

@ -15,7 +15,9 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:array')
+ implementation project(':code:libraries:slop')
implementation project(':code:common:model')
+ implementation project(':code:processes:converting-process:model')
implementation project(':third-party:parquet-floor')
implementation project(':third-party:commons-codec')

View File

@ -0,0 +1,53 @@
package nu.marginalia.index.journal;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
public record IndexJournal(Path journalDir) {
public static final String JOURNAL_FILE_NAME = "index-journal";
public static Path allocateName(Path base) {
return base.resolve(JOURNAL_FILE_NAME);
}
/** Returns the journal file in the base directory. */
public static Optional<IndexJournal> findJournal(Path baseDirectory) {
Path journal = baseDirectory.resolve(JOURNAL_FILE_NAME);
if (Files.isDirectory(journal)) {
return Optional.of(new IndexJournal(journal));
}
return Optional.empty();
}
/** Returns the number of versions of the journal file in the base directory. */
public static int numPages(Path baseDirectory) {
for (int version = 0; ; version++) {
if (!IndexJournalPage.combinedId.forPage(version).exists(baseDirectory)) {
return version;
}
}
}
public IndexJournal {
if (!journalDir.toFile().isDirectory()) {
throw new IllegalArgumentException("Invalid journal directory: " + journalDir);
}
}
public List<IndexJournalPage> pages() {
int pages = numPages(journalDir);
List<IndexJournalPage> instances = new ArrayList<>(pages);
for (int version = 0; version < pages; version++) {
instances.add(new IndexJournalPage(journalDir, version));
}
return instances;
}
}
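As a rough illustration of how this is meant to be consumed (not part of the commit; the directory path and the loop body are hypothetical, only findJournal(), pages() and openCombinedId() come from the classes in this change):
Path indexDir = Path.of("/some/index/dir");   // hypothetical location of the slop journal
IndexJournal journal = IndexJournal.findJournal(indexDir)
        .orElseThrow(() -> new IllegalStateException("no journal under " + indexDir));
for (IndexJournalPage page : journal.pages()) {
    // each page is one numbered set of slop column files,
    // e.g. page.openCombinedId() opens that page's combinedId column
}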

View File

@ -1,30 +0,0 @@
package nu.marginalia.index.journal;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
public class IndexJournalFileNames {
public static Path allocateName(Path base, int idx) {
return base.resolve(String.format("page-index-%04d.dat", idx));
}
public static List<Path> findJournalFiles(Path baseDirectory) throws IOException {
List<Path> ret = new ArrayList<>();
try (var listStream = Files.list(baseDirectory)) {
listStream
.filter(IndexJournalFileNames::isJournalFile)
.sorted()
.forEach(ret::add);
}
return ret;
}
public static boolean isJournalFile(Path file) {
return file.toFile().getName().matches("page-index-\\d{4}.dat");
}
}

View File

@ -0,0 +1,76 @@
package nu.marginalia.index.journal;
import nu.marginalia.slop.column.array.ByteArrayColumnReader;
import nu.marginalia.slop.column.array.ByteArrayColumnWriter;
import nu.marginalia.slop.column.dynamic.GammaCodedSequenceReader;
import nu.marginalia.slop.column.dynamic.GammaCodedSequenceWriter;
import nu.marginalia.slop.column.dynamic.VarintColumnReader;
import nu.marginalia.slop.column.dynamic.VarintColumnWriter;
import nu.marginalia.slop.column.primitive.*;
import nu.marginalia.slop.desc.ColumnDesc;
import nu.marginalia.slop.desc.ColumnType;
import nu.marginalia.slop.desc.StorageType;
import java.io.IOException;
import java.nio.file.Path;
public record IndexJournalPage(Path baseDir, int page) {
public static final ColumnDesc<IntColumnReader, IntColumnWriter> features = new ColumnDesc<>("features", ColumnType.INT_LE, StorageType.PLAIN);
public static final ColumnDesc<IntColumnReader, IntColumnWriter> size = new ColumnDesc<>("size", ColumnType.INT_LE, StorageType.PLAIN);
public static final ColumnDesc<LongColumnReader, LongColumnWriter> combinedId = new ColumnDesc<>("combinedId", ColumnType.LONG_LE, StorageType.PLAIN);
public static final ColumnDesc<LongColumnReader, LongColumnWriter> documentMeta = new ColumnDesc<>("documentMeta", ColumnType.LONG_LE, StorageType.PLAIN);
public static final ColumnDesc<VarintColumnReader, VarintColumnWriter> termCounts = new ColumnDesc<>("termCounts", ColumnType.VARINT_LE, StorageType.PLAIN);
public static final ColumnDesc<LongColumnReader, LongColumnWriter> termIds = new ColumnDesc<>("termIds", ColumnType.LONG_LE, StorageType.ZSTD);
public static final ColumnDesc<ByteColumnReader, ByteColumnWriter> termMeta = new ColumnDesc<>("termMetadata", ColumnType.BYTE, StorageType.ZSTD);
public static final ColumnDesc<GammaCodedSequenceReader, GammaCodedSequenceWriter> positions = new ColumnDesc<>("termPositions", ColumnType.BYTE_ARRAY_GCS, StorageType.ZSTD);
public static final ColumnDesc<ByteArrayColumnReader, ByteArrayColumnWriter> spanCodes = new ColumnDesc<>("spanCodes", ColumnType.BYTE_ARRAY, StorageType.ZSTD);
public static final ColumnDesc<GammaCodedSequenceReader, GammaCodedSequenceWriter> spans = new ColumnDesc<>("spans", ColumnType.BYTE_ARRAY_GCS, StorageType.ZSTD);
public IndexJournalPage {
if (!baseDir.toFile().isDirectory()) {
throw new IllegalArgumentException("Invalid base directory: " + baseDir);
}
}
public LongColumnReader openCombinedId() throws IOException {
return combinedId.forPage(page).open(baseDir);
}
public LongColumnReader openDocumentMeta() throws IOException {
return documentMeta.forPage(page).open(baseDir);
}
public IntColumnReader openFeatures() throws IOException {
return features.forPage(page).open(baseDir);
}
public IntColumnReader openSize() throws IOException {
return size.forPage(page).open(baseDir);
}
public LongColumnReader openTermCounts() throws IOException {
return termCounts.forPage(page).open(baseDir);
}
public LongColumnReader openTermIds() throws IOException {
return termIds.forPage(page).open(baseDir);
}
public ByteColumnReader openTermMetadata() throws IOException {
return termMeta.forPage(page).open(baseDir);
}
public GammaCodedSequenceReader openTermPositions() throws IOException {
return positions.forPage(page).open(baseDir);
}
public GammaCodedSequenceReader openSpans() throws IOException {
return spans.forPage(page).open(baseDir);
}
public ByteArrayColumnReader openSpanCodes() throws IOException {
return spanCodes.forPage(page).open(baseDir);
}
}

View File

@ -0,0 +1,105 @@
package nu.marginalia.index.journal;
import lombok.SneakyThrows;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.slop.column.array.ByteArrayColumnWriter;
import nu.marginalia.slop.column.dynamic.GammaCodedSequenceWriter;
import nu.marginalia.slop.column.primitive.ByteColumnWriter;
import nu.marginalia.slop.column.primitive.IntColumnWriter;
import nu.marginalia.slop.column.primitive.LongColumnWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
public class IndexJournalSlopWriter implements AutoCloseable {
private final IntColumnWriter featuresWriter;
private final IntColumnWriter sizeWriter;
private final LongColumnWriter combinedIdWriter;
private final LongColumnWriter documentMetaWriter;
private final LongColumnWriter termCountsWriter;
private final LongColumnWriter termIdsWriter;
private final ByteColumnWriter termMetadataWriter;
private final GammaCodedSequenceWriter termPositionsWriter;
private final GammaCodedSequenceWriter spansWriter;
private final ByteArrayColumnWriter spanCodesWriter;
private static final MurmurHash3_128 hash = new MurmurHash3_128();
public IndexJournalSlopWriter(Path dir, int page) throws IOException {
if (!Files.exists(dir)) {
Files.createDirectory(dir);
}
featuresWriter = IndexJournalPage.features.forPage(page).create(dir);
sizeWriter = IndexJournalPage.size.forPage(page).create(dir);
combinedIdWriter = IndexJournalPage.combinedId.forPage(page).create(dir);
documentMetaWriter = IndexJournalPage.documentMeta.forPage(page).create(dir);
termCountsWriter = IndexJournalPage.termCounts.forPage(page).create(dir);
termIdsWriter = IndexJournalPage.termIds.forPage(page).create(dir);
termMetadataWriter = IndexJournalPage.termMeta.forPage(page).create(dir);
termPositionsWriter = IndexJournalPage.positions.forPage(page).create(dir);
spansWriter = IndexJournalPage.spans.forPage(page).create(dir);
spanCodesWriter = IndexJournalPage.spanCodes.forPage(page).create(dir);
}
@SneakyThrows
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection) {
combinedIdWriter.put(combinedId);
featuresWriter.put(keywordsProjection.htmlFeatures());
sizeWriter.put(keywordsProjection.length());
documentMetaWriter.put(keywordsProjection.documentMetadata());
// -- write keyword data --
final List<String> keywords = keywordsProjection.words();
byte[] termMetadata = keywordsProjection.metas();
termCountsWriter.put(keywords.size());
// termIds are the special hashes of the keywords
long[] termIds = new long[keywordsProjection.words().size()];
for (int i = 0; i < termIds.length; i++) {
termIds[i] = hash.hashKeyword(keywords.get(i));
}
List<CodedSequence> termPositions = keywordsProjection.positions();
for (int i = 0; i < termMetadata.length; i++) {
termMetadataWriter.put(termMetadata[i]);
termIdsWriter.put(termIds[i]);
termPositionsWriter.put((GammaCodedSequence) termPositions.get(i));
}
// -- write spans --
spanCodesWriter.put(keywordsProjection.spanCodes());
for (var span : keywordsProjection.spans()) {
spansWriter.put((GammaCodedSequence) span);
}
}
public void close() throws IOException {
featuresWriter.close();
sizeWriter.close();
combinedIdWriter.close();
documentMetaWriter.close();
termCountsWriter.close();
termIdsWriter.close();
termMetadataWriter.close();
termPositionsWriter.close();
spansWriter.close();
spanCodesWriter.close();
}
}

View File

@ -1,36 +0,0 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.sequence.CodedSequence;
public record IndexJournalEntryData(long[] termIds,
long[] metadata,
CodedSequence[] positions) {
public IndexJournalEntryData {
assert termIds.length == metadata.length;
assert termIds.length == positions.length;
}
public IndexJournalEntryData(String[] keywords,
long[] metadata,
CodedSequence[] positions)
{
this(termIds(keywords), metadata, positions);
}
private static final MurmurHash3_128 hash = new MurmurHash3_128();
public int size() {
return termIds.length;
}
private static long[] termIds(String[] keywords) {
long[] termIds = new long[keywords.length];
for (int i = 0; i < keywords.length; i++) {
termIds[i] = hash.hashKeyword(keywords[i]);
}
return termIds;
}
}

View File

@ -1,35 +0,0 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
/** The header of an index journal entry.
*
* @param entrySize the size of the entry
* @param documentFeatures the features of the document, as an encoded HtmlFeature
* @param combinedId the combined document id, encoded with UrlIdCodec
* @param documentMeta the metadata of the document, as an encoded DocumentMetadata
*
* @see DocumentMetadata
* @see HtmlFeature
* @see UrlIdCodec
*/
public record IndexJournalEntryHeader(int entrySize,
int documentFeatures,
int documentSize,
long combinedId,
long documentMeta) {
public IndexJournalEntryHeader(long combinedId,
int documentFeatures,
int documentSize,
long documentMeta) {
this(-1,
documentFeatures,
documentSize,
combinedId,
documentMeta);
}
}

View File

@ -1,25 +0,0 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import java.nio.ByteBuffer;
/** Data corresponding to a term in a document in the index journal.
*
* @param termId the id of the term
* @param metadata the metadata of the term
* @param positionsBuffer buffer holding positions of the word in the document, gamma coded
*
* @see GammaCodedSequence
*/
public record IndexJournalEntryTermData(
long termId,
long metadata,
ByteBuffer positionsBuffer)
{
public CodedSequence positions() {
return new GammaCodedSequence(positionsBuffer);
}
}

View File

@ -1,10 +0,0 @@
package nu.marginalia.index.journal.model;
/** The header of an index journal file. This is the first 16 bytes of the file,
* and is not compressed.
*
* @param fileSizeRecords the size of the file in number of records
* @param reserved should be 0
*/
public record IndexJournalFileHeader(long fileSizeRecords, long reserved) {
}

View File

@ -1,111 +0,0 @@
package nu.marginalia.index.journal.reader;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import nu.marginalia.model.id.UrlIdCodec;
import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Iterator;
public class IndexJournalReadEntry implements Iterable<IndexJournalEntryTermData> {
public final IndexJournalEntryHeader header;
private final ByteBuffer buffer;
private final int initialPos;
public IndexJournalReadEntry(IndexJournalEntryHeader header, ByteBuffer buffer) {
this.header = header;
this.buffer = buffer;
this.initialPos = buffer.position();
}
public static IndexJournalReadEntry read(DataInputStream inputStream) throws IOException {
final int entrySize = (inputStream.readShort() & 0xFFFF);
final int docSize = inputStream.readShort();
final int docFeatures = inputStream.readInt();
final long docId = inputStream.readLong();
final long meta = inputStream.readLong();
var header = new IndexJournalEntryHeader(
entrySize,
docFeatures,
docSize,
docId,
meta);
byte[] buffer = new byte[entrySize];
inputStream.readFully(buffer);
return new IndexJournalReadEntry(header, ByteBuffer.wrap(buffer));
}
public long docId() {
return header.combinedId();
}
public long docMeta() {
return header.documentMeta();
}
public int documentFeatures() {
return header.documentFeatures();
}
public int documentSize() {
return header.documentSize();
}
public int domainId() {
return UrlIdCodec.getDomainId(docId());
}
public void reset() {
buffer.position(initialPos);
}
public Iterator<IndexJournalEntryTermData> iterator() {
return new TermDataIterator(buffer, initialPos);
}
}
class TermDataIterator implements Iterator<IndexJournalEntryTermData> {
private final ByteBuffer buffer;
// Pointer alias to buffer, used to reduce slice() allocation overhead in the iterator
private final ByteBuffer alias;
TermDataIterator(ByteBuffer buffer, int initialPos) {
this.buffer = buffer;
this.buffer.position(initialPos);
this.alias = buffer.duplicate();
}
@Override
public boolean hasNext() {
return buffer.position() < buffer.limit();
}
@Override
public IndexJournalEntryTermData next() {
// read the metadata for the term
long termId = buffer.getLong();
long meta = buffer.getShort();
// read the size of the sequence data
int size = buffer.getShort() & 0xFFFF;
// position the alias buffer to the term data
alias.limit(buffer.position() + size);
alias.position(buffer.position());
// advance the buffer position to the next term
buffer.position(buffer.position() + size);
return new IndexJournalEntryTermData(termId, meta, alias);
}
}

View File

@ -1,73 +0,0 @@
package nu.marginalia.index.journal.reader;
import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer;
import java.io.IOException;
import java.nio.file.Path;
import java.util.function.LongConsumer;
import java.util.function.LongPredicate;
/** Tools for reading the index journal. */
public interface IndexJournalReader {
int FILE_HEADER_SIZE_LONGS = 2;
int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS;
int DOCUMENT_HEADER_SIZE_BYTES = 24;
int TERM_HEADER_SIZE_BYTES = 12;
/** Create a reader for a single file. */
static IndexJournalReader singleFile(Path fileName) throws IOException {
return new IndexJournalReaderSingleFile(fileName);
}
/** Create a reader for a set of files. */
static IndexJournalReader paging(Path baseDir) throws IOException {
return new IndexJournalReaderPagingImpl(baseDir);
}
default void forEachWordId(LongConsumer consumer) {
var ptr = this.newPointer();
while (ptr.nextDocument()) {
for (var termData : ptr) {
consumer.accept(termData.termId());
}
}
}
default void forEachDocId(LongConsumer consumer) throws IOException {
try (var ptr = this.newPointer()) {
while (ptr.nextDocument()) {
consumer.accept(ptr.documentId());
}
}
}
/** Create a new pointer to the journal. The IndexJournalPointer is
* a two-tiered iterator that allows both iteration over document records
* and the terms within each document.
*/
IndexJournalPointer newPointer();
/** Reader that filters the entries based on the term metadata. */
default IndexJournalReader filtering(LongPredicate termMetaFilter) {
return new FilteringIndexJournalReader(this, termMetaFilter);
}
}
class FilteringIndexJournalReader implements IndexJournalReader {
private final IndexJournalReader base;
private final LongPredicate termMetaFilter;
FilteringIndexJournalReader(IndexJournalReader base, LongPredicate termMetaFilter) {
this.base = base;
this.termMetaFilter = termMetaFilter;
}
@Override
public IndexJournalPointer newPointer() {
return base
.newPointer()
.filterWordMeta(termMetaFilter);
}
}

View File

@ -1,43 +0,0 @@
package nu.marginalia.index.journal.reader;
import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
public class IndexJournalReaderPagingImpl implements IndexJournalReader {
private static final Logger logger = LoggerFactory.getLogger(IndexJournalReaderPagingImpl.class);
private final List<IndexJournalReader> readers;
public IndexJournalReaderPagingImpl(Path baseDir) throws IOException {
this(IndexJournalFileNames.findJournalFiles(baseDir));
if (readers.isEmpty())
logger.warn("Creating paging index journal file in {}, found no inputs!", baseDir);
else
logger.info("Creating paging index journal reader for {} inputs", readers.size());
}
public IndexJournalReaderPagingImpl(List<Path> inputFiles) throws IOException {
this.readers = new ArrayList<>(inputFiles.size());
for (var inputFile : inputFiles) {
readers.add(new IndexJournalReaderSingleFile(inputFile));
}
}
@Override
public IndexJournalPointer newPointer() {
return IndexJournalPointer.concatenate(
readers.stream()
.map(IndexJournalReader::newPointer)
.toArray(IndexJournalPointer[]::new)
);
}
}

View File

@ -1,116 +0,0 @@
package nu.marginalia.index.journal.reader;
import com.github.luben.zstd.ZstdInputStream;
import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import nu.marginalia.index.journal.model.IndexJournalFileHeader;
import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer;
import org.jetbrains.annotations.NotNull;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Iterator;
public class IndexJournalReaderSingleFile implements IndexJournalReader {
private final Path journalFile;
public final IndexJournalFileHeader fileHeader;
@Override
public String toString() {
return "IndexJournalReaderSingleCompressedFile{" + journalFile + " }";
}
public IndexJournalReaderSingleFile(Path file) throws IOException {
this.journalFile = file;
fileHeader = readHeader(file);
}
private static IndexJournalFileHeader readHeader(Path file) throws IOException {
try (var raf = new RandomAccessFile(file.toFile(), "r")) {
long recordCount = raf.readLong();
long unused = raf.readLong();
return new IndexJournalFileHeader(recordCount, unused);
}
}
private static DataInputStream createInputStream(Path file) throws IOException {
var fileInputStream = Files.newInputStream(file, StandardOpenOption.READ);
// skip the header
fileInputStream.skipNBytes(16);
return new DataInputStream(new ZstdInputStream(new BufferedInputStream(fileInputStream)));
}
@SneakyThrows
@Override
public IndexJournalPointer newPointer() {
return new SingleFileJournalPointer(fileHeader, createInputStream(journalFile));
}
}
class SingleFileJournalPointer implements IndexJournalPointer {
private final IndexJournalFileHeader fileHeader;
private final DataInputStream dataInputStream;
private IndexJournalReadEntry entry;
private int docIdx = -1;
public SingleFileJournalPointer(
IndexJournalFileHeader fileHeader,
DataInputStream dataInputStream)
{
this.fileHeader = fileHeader;
this.dataInputStream = dataInputStream;
}
@SneakyThrows
@Override
public boolean nextDocument() {
if (++docIdx < fileHeader.fileSizeRecords()) {
entry = IndexJournalReadEntry.read(dataInputStream);
return true;
}
dataInputStream.close();
return false;
}
@Override
public long documentId() {
return entry.docId();
}
@Override
public long documentMeta() {
return entry.docMeta();
}
@Override
public int documentFeatures() { return entry.documentFeatures(); }
@Override
public int documentSize() { return entry.documentSize(); }
/** Return an iterator over the terms in the current document.
* This iterator is not valid after calling nextDocument().
*/
@NotNull
@Override
public Iterator<IndexJournalEntryTermData> iterator() {
return entry.iterator();
}
@Override
public void close() throws IOException {
dataInputStream.close();
}
}

View File

@ -1,202 +0,0 @@
package nu.marginalia.index.journal.reader.pointer;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.util.Iterator;
import java.util.function.LongPredicate;
/**
* This is something like a double iterator. The Index Journal consists of
* blocks of words and word-metadata for each document and document metadata.
* <br>
*
* Perhaps best conceptualized as something like
*
* <pre>[doc1: word1 word2 word3 word4] [doc2: word1 word2 word3 ]</pre>
* nextDocument() will move the pointer from doc1 to doc2;<br>
* nextRecord() will move the pointer from word1 to word2...<br>
*/
public interface IndexJournalPointer extends Iterable<IndexJournalEntryTermData>, AutoCloseable {
/**
* Advance to the next document in the journal,
* returning true if such a document exists.
* Resets the record index to before the first
* record (if it exists).
*/
boolean nextDocument();
/**
* Get the id associated with the current document
*/
long documentId();
/**
* Get the metadata associated with the current document
*/
long documentMeta();
/**
* Get the documentFeatures associated with the current record
*/
int documentFeatures();
int documentSize();
/** Concatenate a number of journal pointers */
static IndexJournalPointer concatenate(IndexJournalPointer... pointers) {
if (pointers.length == 1)
return pointers[0];
return new JoiningJournalPointer(pointers);
}
/** Add a filter on word metadata to the pointer */
default IndexJournalPointer filterWordMeta(LongPredicate filter) {
return new FilteringJournalPointer(this, filter);
}
void close() throws IOException;
}
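For reference, the pointer API being removed here was consumed roughly as follows (a sketch reconstructed from the interface above and the forEach helpers in IndexJournalReader, not code from the commit; 'reader' is assumed to be an IndexJournalReader):
try (IndexJournalPointer ptr = reader.newPointer()) {
    while (ptr.nextDocument()) {
        long docId = ptr.documentId();   // combined document id
        for (IndexJournalEntryTermData term : ptr) {
            // term.termId(), term.metadata(), term.positions() ...
        }
    }
}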
class JoiningJournalPointer implements IndexJournalPointer {
private final IndexJournalPointer[] pointers;
private int pIndex = 0;
JoiningJournalPointer(IndexJournalPointer[] pointers) {
this.pointers = pointers;
}
@Override
public boolean nextDocument() {
while (pIndex < pointers.length) {
if (pointers[pIndex].nextDocument())
return true;
else pIndex++;
}
return false;
}
@Override
public long documentId() {
return pointers[pIndex].documentId();
}
@Override
public long documentMeta() {
return pointers[pIndex].documentMeta();
}
@Override
public int documentFeatures() {
return pointers[pIndex].documentFeatures();
}
@Override
public int documentSize() {
return pointers[pIndex].documentSize();
}
@NotNull
@Override
public Iterator<IndexJournalEntryTermData> iterator() {
return pointers[pIndex].iterator();
}
public void close() {
for (var p : pointers) {
try {
p.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
}
class FilteringJournalPointer implements IndexJournalPointer {
private final IndexJournalPointer base;
private final LongPredicate filter;
FilteringJournalPointer(IndexJournalPointer base, LongPredicate filter) {
this.base = base;
this.filter = filter;
}
@Override
public boolean nextDocument() {
while (base.nextDocument()) {
if (iterator().hasNext()) {
return true;
}
}
return false;
}
@Override
public long documentId() {
return base.documentId();
}
@Override
public long documentMeta() {
return base.documentMeta();
}
@Override
public int documentFeatures() {
return base.documentFeatures();
}
@Override
public int documentSize() {
return base.documentSize();
}
@NotNull
@Override
public Iterator<IndexJournalEntryTermData> iterator() {
return new Iterator<>() {
private final Iterator<IndexJournalEntryTermData> baseIter = base.iterator();
private IndexJournalEntryTermData value = null;
@Override
public boolean hasNext() {
if (value != null) {
return true;
}
while (baseIter.hasNext()) {
value = baseIter.next();
if (filter.test(value.metadata())) {
return true;
}
}
value = null;
return false;
}
@Override
public IndexJournalEntryTermData next() {
if (hasNext()) {
var ret = value;
value = null;
return ret;
} else {
throw new IllegalStateException("No more elements");
}
}
};
}
@Override
public void close() throws IOException {
base.close();
}
}

View File

@ -1,17 +0,0 @@
package nu.marginalia.index.journal.writer;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import java.io.IOException;
/** Responsible for writing to the index journal.
* <p></p>
* @see IndexJournalWriterSingleFileImpl
* @see IndexJournalWriterPagingImpl
*/
public interface IndexJournalWriter extends AutoCloseable {
void close() throws IOException;
int put(IndexJournalEntryHeader header, IndexJournalEntryData data);
}

View File

@ -1,68 +0,0 @@
package nu.marginalia.index.journal.writer;
import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
/** IndexJournalWriter implementation that creates a sequence of journal files,
* delegating to IndexJournalWriterSingleFileImpl to write the individual files.
*
*/
public class IndexJournalWriterPagingImpl implements IndexJournalWriter {
private final Path outputDir;
private int fileNumber = 0;
/** The maximum size of a journal file, in uncompressed bytes.
* This should be safely below 2 GB, since we assume in the construction
* of the index that this is the case! The smaller these files are, the
* slower the index construction will be, but at the same time, if 2 GB
* is exceeded, the index construction will *quietly* fail.
*
* Flap flap, Icarus!
*/
private static final long sizeLimitBytes = 1_000_000_000; // 1 GB
private final Logger logger = LoggerFactory.getLogger(getClass());
private IndexJournalWriter currentWriter = null;
private long bytesWritten = 0;
public IndexJournalWriterPagingImpl(Path outputDir) throws IOException {
this.outputDir = outputDir;
switchToNextWriter();
logger.info("Creating Journal Writer {}", outputDir);
}
private void switchToNextWriter() throws IOException {
if (currentWriter != null)
currentWriter.close();
currentWriter = new IndexJournalWriterSingleFileImpl(IndexJournalFileNames.allocateName(outputDir, fileNumber++));
}
@Override
@SneakyThrows
public int put(IndexJournalEntryHeader header, IndexJournalEntryData data)
{
if (bytesWritten >= sizeLimitBytes) {
bytesWritten = 0;
switchToNextWriter();
}
int writtenNow = currentWriter.put(header, data);
bytesWritten += writtenNow;
return writtenNow;
}
public void close() throws IOException {
currentWriter.close();
}
}

View File

@ -1,155 +0,0 @@
package nu.marginalia.index.journal.writer;
import com.github.luben.zstd.ZstdDirectBufferCompressingStream;
import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.sequence.CodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions;
/** IndexJournalWriter implementation that creates a single journal file */
public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
private static final int ZSTD_BUFFER_SIZE = 1<<16;
private static final int DATA_BUFFER_SIZE = 1<<16;
private final ByteBuffer dataBuffer = ByteBuffer.allocateDirect(DATA_BUFFER_SIZE);
private final ZstdDirectBufferCompressingStream compressingStream;
private final FileChannel fileChannel;
private int numEntries = 0;
private boolean closed = false;
private final Logger logger = LoggerFactory.getLogger(getClass());
public IndexJournalWriterSingleFileImpl(Path outputFile) throws IOException {
logger.info("Creating Journal Writer {}", outputFile);
Files.deleteIfExists(outputFile);
Files.createFile(outputFile, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
fileChannel = FileChannel.open(outputFile, StandardOpenOption.CREATE,
StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING);
writeHeaderPlaceholder(fileChannel);
compressingStream = new ZstdDirectBufferCompressingStream(ByteBuffer.allocateDirect(ZSTD_BUFFER_SIZE), 3) {
protected ByteBuffer flushBuffer(ByteBuffer toFlush) throws IOException {
toFlush.flip();
while (toFlush.hasRemaining()) {
fileChannel.write(toFlush);
}
toFlush.clear();
return toFlush;
}
};
}
/** The file has a non-compressed header at the beginning of the file.
* Write a placeholder first to reserve the bytes, and position the
* channel after the header
*/
private static void writeHeaderPlaceholder(FileChannel fileStream) throws IOException {
var buffer = ByteBuffer.allocate(IndexJournalReader.FILE_HEADER_SIZE_BYTES);
buffer.position(0);
buffer.limit(buffer.capacity());
while (buffer.hasRemaining())
fileStream.write(buffer, buffer.position());
fileStream.position(IndexJournalReader.FILE_HEADER_SIZE_BYTES);
}
@Override
@SneakyThrows
public int put(IndexJournalEntryHeader header,
IndexJournalEntryData data)
{
final long[] keywords = data.termIds();
final long[] metadata = data.metadata();
final CodedSequence[] positions = data.positions();
int entrySize = 0;
for (var position : positions) {
entrySize += IndexJournalReader.TERM_HEADER_SIZE_BYTES + position.bufferSize();
}
int totalSize = IndexJournalReader.DOCUMENT_HEADER_SIZE_BYTES + entrySize;
if (entrySize > DATA_BUFFER_SIZE) {
// This should never happen, but if it does, we should log it and deal with it in a way that doesn't corrupt the file
// (64 KB is *a lot* of data for a single document, larger than the uncompressed HTML in like the 95%th percentile of web pages)
logger.error("Omitting entry: Record size {} exceeds maximum representable size of {}", entrySize, DATA_BUFFER_SIZE);
return 0;
}
if (dataBuffer.remaining() < totalSize) {
dataBuffer.flip();
compressingStream.compress(dataBuffer);
dataBuffer.clear();
}
if (dataBuffer.remaining() < totalSize) {
logger.error("Omitting entry: Record size {} exceeds buffer size of {}", totalSize, dataBuffer.capacity());
return 0;
}
assert entrySize < (1 << 16) : "Entry size must not exceed USHORT_MAX";
dataBuffer.putShort((short) entrySize);
dataBuffer.putShort((short) Math.clamp(header.documentSize(), 0, Short.MAX_VALUE));
dataBuffer.putInt(header.documentFeatures());
dataBuffer.putLong(header.combinedId());
dataBuffer.putLong(header.documentMeta());
for (int i = 0; i < keywords.length; i++) {
dataBuffer.putLong(keywords[i]);
dataBuffer.putShort((short) metadata[i]);
dataBuffer.putShort((short) positions[i].bufferSize());
dataBuffer.put(positions[i].buffer());
}
numEntries++;
return totalSize;
}
public void close() throws IOException {
if (closed)
return;
else
closed = true;
dataBuffer.flip();
compressingStream.compress(dataBuffer);
dataBuffer.clear();
compressingStream.flush();
compressingStream.close();
// Finalize the file by writing a header in the beginning
ByteBuffer header = ByteBuffer.allocate(IndexJournalReader.FILE_HEADER_SIZE_BYTES);
header.putLong(numEntries);
header.putLong(0); // reserved for future use
header.flip();
while (header.position() < header.limit()) {
fileChannel.write(header, header.position());
}
fileChannel.close();
}
}

View File

@ -1,448 +0,0 @@
package nu.marginalia.index.journal;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import nu.marginalia.index.journal.reader.IndexJournalReaderPagingImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.sequence.GammaCodedSequence;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static org.junit.jupiter.api.Assertions.*;
public class IndexJournalWriterTest {
Path tempFile;
Path tempFile2;
ByteBuffer workArea = ByteBuffer.allocate(1024);
@BeforeEach
public void setUp() throws IOException {
tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
tempFile2 = Files.createTempFile(getClass().getSimpleName(), ".dat");
}
@AfterEach
public void tearDown() throws IOException {
Files.delete(tempFile);
Files.delete(tempFile2);
}
private GammaCodedSequence gcs(int... values) {
return GammaCodedSequence.generate(workArea, values);
}
static MurmurHash3_128 hasher = new MurmurHash3_128();
static long wordId(String str) {
return hasher.hashKeyword(str);
}
@Test
public void testSingleFile() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
new GammaCodedSequence[]{
gcs(1, 3, 5),
gcs(2, 4, 6),
})
);
writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
new GammaCodedSequence[]{
gcs(2, 4, 6),
gcs(3, 5, 7),
})
);
}
catch (IOException ex) {
Assertions.fail(ex);
}
// Read the journal back
try {
var reader = new IndexJournalReaderSingleFile(tempFile);
Iterator<IndexJournalEntryTermData> iter;
IndexJournalEntryTermData termData;
try (var ptr = reader.newPointer()) {
/** DOCUMENT 1 */
assertTrue(ptr.nextDocument());
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(33, ptr.documentMeta());
assertEquals(10, ptr.documentSize());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(44, termData.metadata());
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
// Term 2
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word2"), termData.termId());
assertEquals(55, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
/** DOCUMENT 2 */
assertTrue(ptr.nextDocument());
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
assertEquals(11, ptr.documentSize());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(45, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// Term 2
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word2"), termData.termId());
assertEquals(56, termData.metadata());
assertEquals(IntList.of(3, 5, 7), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
// No more documents
assertFalse(ptr.nextDocument());
}
}
catch (IOException ex) {
Assertions.fail(ex);
}
}
@Test
public void testMultiFile() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
new GammaCodedSequence[]{
gcs(1, 3, 5),
gcs(2, 4, 6),
})
);
}
catch (IOException ex) {
Assertions.fail(ex);
}
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile2)) {
writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
new GammaCodedSequence[]{
gcs(2, 4, 6),
gcs(3, 5, 7),
})
);
}
catch (IOException ex) {
Assertions.fail(ex);
}
// Read the journal back
try {
var reader = new IndexJournalReaderPagingImpl(List.of(tempFile, tempFile2));
Iterator<IndexJournalEntryTermData> iter;
IndexJournalEntryTermData termData;
try (var ptr = reader.newPointer()) {
/** DOCUMENT 1 */
assertTrue(ptr.nextDocument());
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(33, ptr.documentMeta());
assertEquals(10, ptr.documentSize());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(44, termData.metadata());
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
// Term 2
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word2"), termData.termId());
assertEquals(55, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
/** DOCUMENT 2 */
assertTrue(ptr.nextDocument());
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
assertEquals(11, ptr.documentSize());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(45, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// Term 2
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word2"), termData.termId());
assertEquals(56, termData.metadata());
assertEquals(IntList.of(3, 5, 7), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
// No more documents
assertFalse(ptr.nextDocument());
}
}
catch (IOException ex) {
Assertions.fail(ex);
}
}
@Test
public void testSingleFileIterTwice() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
new GammaCodedSequence[]{
gcs(1, 3, 5),
gcs(2, 4, 6),
})
);
}
catch (IOException ex) {
Assertions.fail(ex);
}
// Read the journal back
try {
var reader = new IndexJournalReaderSingleFile(tempFile);
Iterator<IndexJournalEntryTermData> iter;
IndexJournalEntryTermData termData;
try (var ptr = reader.newPointer()) {
/** DOCUMENT 1 */
assertTrue(ptr.nextDocument());
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(10, ptr.documentSize());
assertEquals(33, ptr.documentMeta());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(44, termData.metadata());
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
// Ensure we can iterate again over the same document without persisting state or closing the pointer
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(44, termData.metadata());
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
}
}
catch (IOException ex) {
Assertions.fail(ex);
}
}
@Test
public void testFiltered() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
new GammaCodedSequence[]{
gcs(1, 3, 5),
gcs(2, 4, 6),
})
);
writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
new GammaCodedSequence[]{
gcs(2, 4, 6),
gcs(3, 5, 7),
}
));
}
catch (IOException ex) {
Assertions.fail(ex);
}
// Read the journal back
try {
var reader = new IndexJournalReaderSingleFile(tempFile).filtering(meta -> meta == 45);
Iterator<IndexJournalEntryTermData> iter;
IndexJournalEntryTermData termData;
try (var ptr = reader.newPointer()) {
/** DOCUMENT 2 */
assertTrue(ptr.nextDocument());
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
assertEquals(11, ptr.documentSize());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(45, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
// No more documents
assertFalse(ptr.nextDocument());
}
}
catch (IOException ex) {
Assertions.fail(ex);
}
}
@Test
public void testIntegrationScenario() throws IOException {
Map<Long, Integer> wordMap = new HashMap<>();
for (int i = 0; i < 512; i++) {
wordMap.put(hasher.hashKeyword(Integer.toString(i)), i);
}
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
for (int idc = 1; idc < 512; idc++) {
int id = idc;
int[] factors = IntStream
.rangeClosed(1, id)
.filter(v -> (id % v) == 0)
.toArray();
System.out.println("id:" + id + " factors: " + Arrays.toString(factors));
long fullId = UrlIdCodec.encodeId((32 - (id % 32)), id);
var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
long[] metadata = new long[factors.length];
for (int i = 0; i < factors.length; i++) {
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
ByteBuffer wa = ByteBuffer.allocate(16);
for (int i = 0; i < factors.length; i++) {
positions[i] = GammaCodedSequence.generate(wa, i + 1);
}
writer.put(header, new IndexJournalEntryData(keywords, metadata, positions));
}
}
try (var ptr = new IndexJournalReaderSingleFile(tempFile).newPointer()) {
while (ptr.nextDocument()) {
int ordinal = UrlIdCodec.getDocumentOrdinal(ptr.documentId());
System.out.println(ordinal);
var expectedFactors =
new LongArrayList(IntStream
.rangeClosed(1, ordinal)
.filter(v -> (ordinal % v) == 0)
.mapToObj(Integer::toString)
.mapToLong(hasher::hashKeyword)
.toArray());
LongList foundIds = new LongArrayList();
var iter = ptr.iterator();
while (iter.hasNext()) {
var termData = iter.next();
foundIds.add(termData.termId());
}
if (!expectedFactors.equals(foundIds)) {
System.out.println("Found: ");
System.out.println(foundIds.stream().map(fac -> wordMap.getOrDefault(fac, -1)).map(Objects::toString).collect(Collectors.joining(",")));
System.out.println("Expected: ");
System.out.println(expectedFactors.stream().map(fac -> wordMap.getOrDefault(fac, -1)).map(Objects::toString).collect(Collectors.joining(",")));
fail();
}
assertEquals(expectedFactors, foundIds);
}
}
}
}

View File

@ -16,11 +16,13 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
+ implementation project(':code:libraries:slop')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:random-write-funnel')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
implementation project(':code:common:model')
+ implementation project(':code:processes:converting-process:model')
implementation project(':code:common:process')
implementation project(':third-party:parquet-floor')
@ -34,5 +36,6 @@ dependencies {
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
+ testImplementation project(':code:libraries:test-helpers')
}

View File

@ -1,10 +0,0 @@
package nu.marginalia.index.construction;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import java.io.IOException;
import java.nio.file.Path;
public interface JournalReaderSource {
IndexJournalReader construct(Path sourceFile) throws IOException;
}

View File

@ -2,10 +2,10 @@ package nu.marginalia.index.construction.full;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.JournalReaderSource;
import nu.marginalia.index.construction.PositionsFileConstructor; import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -26,20 +26,17 @@ public class FullIndexConstructor {
private final Path outputFileDocs; private final Path outputFileDocs;
private final Path outputFileWords; private final Path outputFileWords;
private final Path outputFilePositions; private final Path outputFilePositions;
private final JournalReaderSource readerSource;
private final DocIdRewriter docIdRewriter; private final DocIdRewriter docIdRewriter;
private final Path tmpDir; private final Path tmpDir;
public FullIndexConstructor(Path outputFileDocs, public FullIndexConstructor(Path outputFileDocs,
Path outputFileWords, Path outputFileWords,
Path outputFilePositions, Path outputFilePositions,
JournalReaderSource readerSource,
DocIdRewriter docIdRewriter, DocIdRewriter docIdRewriter,
Path tmpDir) { Path tmpDir) {
this.outputFileDocs = outputFileDocs; this.outputFileDocs = outputFileDocs;
this.outputFileWords = outputFileWords; this.outputFileWords = outputFileWords;
this.outputFilePositions = outputFilePositions; this.outputFilePositions = outputFilePositions;
this.readerSource = readerSource;
this.docIdRewriter = docIdRewriter; this.docIdRewriter = docIdRewriter;
this.tmpDir = tmpDir; this.tmpDir = tmpDir;
} }
@ -48,8 +45,8 @@ public class FullIndexConstructor {
String processName, String processName,
Path sourceBaseDir) throws IOException Path sourceBaseDir) throws IOException
{ {
var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir); var journal = IndexJournal.findJournal(sourceBaseDir);
if (inputs.isEmpty()) { if (journal.isEmpty()) {
logger.error("No journal files in base dir {}", sourceBaseDir); logger.error("No journal files in base dir {}", sourceBaseDir);
return; return;
} }
@ -62,10 +59,12 @@ public class FullIndexConstructor {
AtomicInteger progress = new AtomicInteger(0); AtomicInteger progress = new AtomicInteger(0);
inputs var journalVersions = journal.get().pages();
.parallelStream()
journalVersions
.stream()
.map(in -> { .map(in -> {
preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size()); preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), journalVersions.size());
return construct(in, posConstructor); return construct(in, posConstructor);
}) })
.reduce(this::merge) .reduce(this::merge)
@ -80,9 +79,9 @@ public class FullIndexConstructor {
} }
@SneakyThrows @SneakyThrows
private FullPreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) { private FullPreindexReference construct(IndexJournalPage journalInstance, PositionsFileConstructor positionsFileConstructor) {
return FullPreindex return FullPreindex
.constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir) .constructPreindex(journalInstance, positionsFileConstructor, docIdRewriter, tmpDir)
.closeToReference(); .closeToReference();
} }
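
Worth noting in the hunk above: the constructor no longer takes a JournalReaderSource and now locates the journal itself, and the pages are folded with a sequential stream() rather than the earlier parallelStream(). A minimal sketch of the new discovery-and-merge flow, using only the calls visible here (heartbeat reporting elided; posConstructor is the PositionsFileConstructor from the surrounding method):

    var maybeJournal = IndexJournal.findJournal(sourceBaseDir);
    if (maybeJournal.isEmpty()) {
        logger.error("No journal files in base dir {}", sourceBaseDir);
        return;
    }

    var pages = maybeJournal.get().pages();

    var merged = pages.stream()
            .map(page -> construct(page, posConstructor))  // one preindex reference per journal page
            .reduce(this::merge);                          // fold pairwise into a single preindex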

View File

@ -8,7 +8,7 @@ import nu.marginalia.index.construction.CountToOffsetTransformer;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.IndexSizeEstimator; import nu.marginalia.index.construction.IndexSizeEstimator;
import nu.marginalia.index.construction.PositionsFileConstructor; import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.IndexJournalPage;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -43,7 +43,7 @@ public class FullPreindex {
/** Constructs a new preindex with the data associated with reader. The backing files /** Constructs a new preindex with the data associated with reader. The backing files
* will have randomly assigned names. * will have randomly assigned names.
*/ */
public static FullPreindex constructPreindex(IndexJournalReader reader, public static FullPreindex constructPreindex(IndexJournalPage journalInstance,
PositionsFileConstructor positionsFileConstructor, PositionsFileConstructor positionsFileConstructor,
DocIdRewriter docIdRewriter, DocIdRewriter docIdRewriter,
Path workDir) throws IOException Path workDir) throws IOException
@ -52,13 +52,13 @@ public class FullPreindex {
Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat"); Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
Path docsFile = Files.createTempFile(workDir, "docs", ".dat"); Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
var segments = FullPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile); var segments = FullPreindexWordSegments.construct(journalInstance, segmentWordsFile, segmentCountsFile);
var docs = FullPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments); var docs = FullPreindexDocuments.construct(docsFile, workDir, journalInstance, docIdRewriter, positionsFileConstructor, segments);
return new FullPreindex(segments, docs); return new FullPreindex(segments, docs);
} }
/** Close the associated memory mapped areas and return /** Close the associated memory mapped areas and return
* a dehydrated version of this object that can be re-opened * a dehydrated page of this object that can be re-opened
* later. * later.
*/ */
public FullPreindexReference closeToReference() { public FullPreindexReference closeToReference() {
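
For orientation, a hedged sketch of building a preindex from a single journal page with the updated signature; the directory and file paths are placeholders, not values from this commit:

    IndexJournalPage page = new IndexJournalPage(journalDir, 0);   // journalDir is illustrative

    var preindex = FullPreindex.constructPreindex(
            page,
            new PositionsFileConstructor(positionsFile),   // positions file is written as a side effect
            DocIdRewriter.identity(),
            workDir);

    var reference = preindex.closeToReference();           // release memory maps, keep on-disk paths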

View File

@ -5,12 +5,13 @@ import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor; import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.rwf.RandomFileAssembler; import nu.marginalia.rwf.RandomFileAssembler;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.IOException; import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
@ -39,13 +40,13 @@ public class FullPreindexDocuments {
public static FullPreindexDocuments construct( public static FullPreindexDocuments construct(
Path docsFile, Path docsFile,
Path workDir, Path workDir,
IndexJournalReader reader, IndexJournalPage journalInstance,
DocIdRewriter docIdRewriter, DocIdRewriter docIdRewriter,
PositionsFileConstructor positionsFileConstructor, PositionsFileConstructor positionsFileConstructor,
FullPreindexWordSegments segments) throws IOException { FullPreindexWordSegments segments) throws IOException {
FullPreindexDocuments.positionsFileConstructor = positionsFileConstructor; FullPreindexDocuments.positionsFileConstructor = positionsFileConstructor;
createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter); createUnsortedDocsFile(docsFile, workDir, journalInstance, segments, docIdRewriter);
LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile); LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
sortDocsFile(docsFileMap, segments); sortDocsFile(docsFileMap, segments);
@ -68,28 +69,42 @@ public class FullPreindexDocuments {
private static void createUnsortedDocsFile(Path docsFile, private static void createUnsortedDocsFile(Path docsFile,
Path workDir, Path workDir,
IndexJournalReader reader, IndexJournalPage journalInstance,
FullPreindexWordSegments segments, FullPreindexWordSegments segments,
DocIdRewriter docIdRewriter) throws IOException { DocIdRewriter docIdRewriter) throws IOException {
long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize(); long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
final ByteBuffer tempBuffer = ByteBuffer.allocate(65536);
try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs); try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
var pointer = reader.newPointer()) var docIds = journalInstance.openCombinedId();
var termCounts = journalInstance.openTermCounts();
var termIds = journalInstance.openTermIds();
var termMeta = journalInstance.openTermMetadata();
var positions = journalInstance.openTermPositions())
{ {
var offsetMap = segments.asMap(RECORD_SIZE_LONGS); var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
offsetMap.defaultReturnValue(0); offsetMap.defaultReturnValue(0);
while (pointer.nextDocument()) { while (termCounts.hasRemaining()) {
long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId()); long docId = docIds.get();
for (var termData : pointer) { long rankEncodedId = docIdRewriter.rewriteDocId(docId);
long termId = termData.termId();
long termCount = termCounts.get();
for (int termIdx = 0; termIdx < termCount; termIdx++) {
long termId = termIds.get();
byte meta = termMeta.get();
// Read positions
tempBuffer.clear();
positions.getData(tempBuffer);
tempBuffer.flip();
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS); long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
long encodedPosOffset = positionsFileConstructor.add(meta, tempBuffer);
// write position data to the positions file and get the offset
long encodedPosOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positionsBuffer());
assembly.put(offset + 0, rankEncodedId); assembly.put(offset + 0, rankEncodedId);
assembly.put(offset + 1, encodedPosOffset); assembly.put(offset + 1, encodedPosOffset);
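
The hunk above replaces the pointer-based journal reader with per-column readers. A minimal sketch of that read pattern, assuming each open*() reader behaves as used here, i.e. a forward-only stream with hasRemaining()/get():

    ByteBuffer workArea = ByteBuffer.allocate(65536);

    try (var docIds     = journalInstance.openCombinedId();
         var termCounts = journalInstance.openTermCounts();
         var termIds    = journalInstance.openTermIds();
         var termMeta   = journalInstance.openTermMetadata();
         var positions  = journalInstance.openTermPositions())
    {
        while (docIds.hasRemaining()) {
            long docId     = docIds.get();       // one entry per document
            long termCount = termCounts.get();   // number of term records that follow

            for (int i = 0; i < termCount; i++) {
                long termId = termIds.get();     // per-term columns are read in lockstep
                byte meta   = termMeta.get();

                workArea.clear();
                positions.getData(workArea);     // coded position data for this term
                workArea.flip();

                // consume (docId, termId, meta, positions) here
            }
        }
    }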

View File

@ -5,7 +5,7 @@ import nu.marginalia.array.LongArrayFactory;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Path; import java.nio.file.Path;
/** This is a dehydrated version of a FullPreIndex, that only /** This is a dehydrated page of a FullPreIndex, that only
* keeps references to its location on disk but does not hold associated * keeps references to its location on disk but does not hold associated
* memory maps. * memory maps.
*/ */

View File

@ -5,7 +5,7 @@ import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongIterator; import it.unimi.dsi.fastutil.longs.LongIterator;
import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.IndexJournalPage;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
@ -51,14 +51,20 @@ public class FullPreindexWordSegments {
return ret; return ret;
} }
public static FullPreindexWordSegments construct(IndexJournalReader reader, public static FullPreindexWordSegments construct(IndexJournalPage journalInstance,
Path wordIdsFile, Path wordIdsFile,
Path countsFile) Path countsFile)
throws IOException throws IOException
{ {
Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f); Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
countsMap.defaultReturnValue(0); countsMap.defaultReturnValue(0);
reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1));
try (var termIds = journalInstance.openTermIds()) {
while (termIds.hasRemaining()) {
countsMap.addTo(termIds.get(), 1);
}
}
LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size()); LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size());
LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size()); LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size());
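
In place of reader.forEachWordId(...), term counting now scans the term id column of one journal page. A compact sketch of the same idea, assuming openTermIds() behaves as above; the count map then sizes the words/counts arrays:

    Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
    countsMap.defaultReturnValue(0);

    try (var termIds = journalInstance.openTermIds()) {
        while (termIds.hasRemaining()) {
            countsMap.addTo(termIds.get(), 1);   // one increment per (document, term) record
        }
    }
    // countsMap.size() distinct terms determine how large the words and counts arrays need to be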

View File

@ -2,8 +2,8 @@ package nu.marginalia.index.construction.prio;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.JournalReaderSource; import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalFileNames; import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -24,18 +24,15 @@ public class PrioIndexConstructor {
private final Path outputFileDocs; private final Path outputFileDocs;
private final Path outputFileWords; private final Path outputFileWords;
private final JournalReaderSource readerSource;
private final DocIdRewriter docIdRewriter; private final DocIdRewriter docIdRewriter;
private final Path tmpDir; private final Path tmpDir;
public PrioIndexConstructor(Path outputFileDocs, public PrioIndexConstructor(Path outputFileDocs,
Path outputFileWords, Path outputFileWords,
JournalReaderSource readerSource,
DocIdRewriter docIdRewriter, DocIdRewriter docIdRewriter,
Path tmpDir) { Path tmpDir) {
this.outputFileDocs = outputFileDocs; this.outputFileDocs = outputFileDocs;
this.outputFileWords = outputFileWords; this.outputFileWords = outputFileWords;
this.readerSource = readerSource;
this.docIdRewriter = docIdRewriter; this.docIdRewriter = docIdRewriter;
this.tmpDir = tmpDir; this.tmpDir = tmpDir;
} }
@ -44,8 +41,8 @@ public class PrioIndexConstructor {
String processName, String processName,
Path sourceBaseDir) throws IOException Path sourceBaseDir) throws IOException
{ {
var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir); var journal = IndexJournal.findJournal(sourceBaseDir);
if (inputs.isEmpty()) { if (journal.isEmpty()) {
logger.error("No journal files in base dir {}", sourceBaseDir); logger.error("No journal files in base dir {}", sourceBaseDir);
return; return;
} }
@ -57,10 +54,12 @@ public class PrioIndexConstructor {
AtomicInteger progress = new AtomicInteger(0); AtomicInteger progress = new AtomicInteger(0);
inputs var journalVersions = journal.get().pages();
.parallelStream()
journalVersions
.stream()
.map(in -> { .map(in -> {
preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size()); preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), journalVersions.size());
return construct(in); return construct(in);
}) })
.reduce(this::merge) .reduce(this::merge)
@ -75,9 +74,9 @@ public class PrioIndexConstructor {
} }
@SneakyThrows @SneakyThrows
private PrioPreindexReference construct(Path input) { private PrioPreindexReference construct(IndexJournalPage journalInstance) {
return PrioPreindex return PrioPreindex
.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir) .constructPreindex(journalInstance, docIdRewriter, tmpDir)
.closeToReference(); .closeToReference();
} }
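
The priority-index constructor loses its JournalReaderSource in the same way. A hedged sketch of how it would now be instantiated; the output file names here are placeholders only:

    var constructor = new PrioIndexConstructor(
            outputDir.resolve("prio-docs.dat"),    // outputFileDocs (illustrative name)
            outputDir.resolve("prio-words.dat"),   // outputFileWords (illustrative name)
            DocIdRewriter.identity(),
            tmpDir);

    // like the full constructor, it then finds journal pages itself via IndexJournal.findJournal(sourceBaseDir)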

View File

@ -6,7 +6,7 @@ import nu.marginalia.btree.BTreeWriter;
import nu.marginalia.index.ReverseIndexParameters; import nu.marginalia.index.ReverseIndexParameters;
import nu.marginalia.index.construction.CountToOffsetTransformer; import nu.marginalia.index.construction.CountToOffsetTransformer;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.IndexJournalPage;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -16,7 +16,8 @@ import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.StandardOpenOption; import java.nio.file.StandardOpenOption;
import static nu.marginalia.array.algo.TwoArrayOperations.*; import static nu.marginalia.array.algo.TwoArrayOperations.countDistinctElements;
import static nu.marginalia.array.algo.TwoArrayOperations.mergeArrays;
/** Contains the data that would go into a reverse index, /** Contains the data that would go into a reverse index,
* that is, a mapping from words to documents, minus the actual * that is, a mapping from words to documents, minus the actual
@ -41,7 +42,7 @@ public class PrioPreindex {
/** Constructs a new preindex with the data associated with reader. The backing files /** Constructs a new preindex with the data associated with reader. The backing files
* will have randomly assigned names. * will have randomly assigned names.
*/ */
public static PrioPreindex constructPreindex(IndexJournalReader reader, public static PrioPreindex constructPreindex(IndexJournalPage indexJournalPage,
DocIdRewriter docIdRewriter, DocIdRewriter docIdRewriter,
Path workDir) throws IOException Path workDir) throws IOException
{ {
@ -49,13 +50,13 @@ public class PrioPreindex {
Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat"); Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
Path docsFile = Files.createTempFile(workDir, "docs", ".dat"); Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
var segments = PrioPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile); var segments = PrioPreindexWordSegments.construct(indexJournalPage, segmentWordsFile, segmentCountsFile);
var docs = PrioPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, segments); var docs = PrioPreindexDocuments.construct(docsFile, workDir, indexJournalPage, docIdRewriter, segments);
return new PrioPreindex(segments, docs); return new PrioPreindex(segments, docs);
} }
/** Close the associated memory mapped areas and return /** Close the associated memory mapped areas and return
* a dehydrated version of this object that can be re-opened * a dehydrated page of this object that can be re-opened
* later. * later.
*/ */
public PrioPreindexReference closeToReference() { public PrioPreindexReference closeToReference() {

View File

@ -4,7 +4,7 @@ import lombok.SneakyThrows;
import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.rwf.RandomFileAssembler; import nu.marginalia.rwf.RandomFileAssembler;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -37,11 +37,11 @@ public class PrioPreindexDocuments {
public static PrioPreindexDocuments construct( public static PrioPreindexDocuments construct(
Path docsFile, Path docsFile,
Path workDir, Path workDir,
IndexJournalReader reader, IndexJournalPage journalInstance,
DocIdRewriter docIdRewriter, DocIdRewriter docIdRewriter,
PrioPreindexWordSegments segments) throws IOException { PrioPreindexWordSegments segments) throws IOException {
createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter); createUnsortedDocsFile(docsFile, workDir, journalInstance, segments, docIdRewriter);
LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile); LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
sortDocsFile(docsFileMap, segments); sortDocsFile(docsFileMap, segments);
@ -54,37 +54,41 @@ public class PrioPreindexDocuments {
} }
public LongArray slice(long start, long end) {
return documents.range(start, end);
}
public long size() { public long size() {
return documents.size(); return documents.size();
} }
private static void createUnsortedDocsFile(Path docsFile, private static void createUnsortedDocsFile(Path docsFile,
Path workDir, Path workDir,
IndexJournalReader reader, IndexJournalPage journalInstance,
PrioPreindexWordSegments segments, PrioPreindexWordSegments segments,
DocIdRewriter docIdRewriter) throws IOException { DocIdRewriter docIdRewriter) throws IOException {
long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize(); long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs); try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
var pointer = reader.newPointer()) var docIds = journalInstance.openCombinedId();
var termIdsCounts = journalInstance.openTermCounts();
var termIds = journalInstance.openTermIds();
var termMeta = journalInstance.openTermMetadata())
{ {
var offsetMap = segments.asMap(RECORD_SIZE_LONGS); var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
offsetMap.defaultReturnValue(0); offsetMap.defaultReturnValue(0);
while (pointer.nextDocument()) { while (docIds.hasRemaining()) {
long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId()); long docId = docIds.get();
for (var termData : pointer) { long rankEncodedId = docIdRewriter.rewriteDocId(docId);
long termId = termData.termId();
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS); long termCount = termIdsCounts.get();
for (int termIdx = 0; termIdx < termCount; termIdx++) {
long termId = termIds.get();
byte meta = termMeta.get();
assembly.put(offset, rankEncodedId); if (meta != 0) {
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
assembly.put(offset, rankEncodedId);
}
} }
} }
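
One behavioral detail in the hunk above: the priority index now skips terms whose metadata byte is zero, so only flagged terms produce a docs entry. A sketch of just that filter, assuming meta is the per-term flag byte read from openTermMetadata():

    byte meta = termMeta.get();          // flag bits for this (document, term) pair

    if (meta != 0) {                     // at least one word flag is set
        long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
        assembly.put(offset, rankEncodedId);
    }
    // meta == 0: the term is still indexed in the full index, just not in the priority index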

View File

@ -5,7 +5,7 @@ import nu.marginalia.array.LongArrayFactory;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Path; import java.nio.file.Path;
/** This is a dehydrated version of a PrioPreIndex, that only /** This is a dehydrated page of a PrioPreIndex, that only
* keeps references to its location on disk but does not hold associated * keeps references to its location on disk but does not hold associated
* memory maps. * memory maps.
*/ */

View File

@ -5,7 +5,7 @@ import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongIterator; import it.unimi.dsi.fastutil.longs.LongIterator;
import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.IndexJournalPage;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
@ -51,14 +51,26 @@ public class PrioPreindexWordSegments {
return ret; return ret;
} }
public static PrioPreindexWordSegments construct(IndexJournalReader reader, public static PrioPreindexWordSegments construct(IndexJournalPage journalInstance,
Path wordIdsFile, Path wordIdsFile,
Path countsFile) Path countsFile)
throws IOException throws IOException
{ {
Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f); Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
countsMap.defaultReturnValue(0); countsMap.defaultReturnValue(0);
reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1));
try (var termIds = journalInstance.openTermIds();
var termMetas = journalInstance.openTermMetadata()) {
while (termIds.hasRemaining()) {
long data = termIds.get();
byte meta = termMetas.get();
if (meta != 0) {
countsMap.addTo(data, 1);
}
}
}
LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size()); LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size());
LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size()); LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size());
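
Note that the segment counting above applies the same meta != 0 filter as the priority documents writer; if the two disagreed, the per-term offsets would not line up with the records actually written. A compact sketch of that invariant, assuming the readers behave as in the hunk:

    try (var termIds   = journalInstance.openTermIds();
         var termMetas = journalInstance.openTermMetadata()) {

        while (termIds.hasRemaining()) {
            long termId = termIds.get();
            byte meta   = termMetas.get();   // must be read in lockstep with termIds

            if (meta != 0) {                 // same predicate as the documents writer
                countsMap.addTo(termId, 1);
            }
        }
    }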

View File

@ -2,6 +2,7 @@ package nu.marginalia.index;
import it.unimi.dsi.fastutil.ints.IntList; import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.array.page.LongQueryBuffer; import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor; import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.construction.full.FullPreindex; import nu.marginalia.index.construction.full.FullPreindex;
@ -45,6 +46,11 @@ class FullReverseIndexReaderTest {
Files.delete(tempDir); Files.delete(tempDir);
} }
MurmurHash3_128 hash = new MurmurHash3_128();
long termId(String keyword) {
return hash.hashKeyword(keyword);
}
@Test @Test
public void testSimple() throws IOException { public void testSimple() throws IOException {
@ -52,18 +58,19 @@ class FullReverseIndexReaderTest {
new EntryDataWithWordMeta(100, 101, wm(50, 51, 1, 3, 5)) new EntryDataWithWordMeta(100, 101, wm(50, 51, 1, 3, 5))
); );
assertEquals(1, indexReader.numDocuments(50)); assertEquals(1, indexReader.numDocuments(termId("50")));
var positions = indexReader.getTermData(Arena.global(), 50, new long[] { 100 }); var positions = indexReader.getTermData(Arena.global(), termId("50"), new long[] { 100 });
assertEquals(1, positions.length); assertEquals(1, positions.length);
assertNotNull(positions[0]); assertNotNull(positions[0]);
assertEquals((byte) 51, positions[0].flags()); assertEquals((byte) 51, positions[0].flags());
assertEquals(IntList.of(1, 3, 5), positions[0].positions().values()); assertEquals(IntList.of(1, 3, 5), positions[0].positions().values());
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50)); assertArrayEquals(new long[] { 100 }, readEntries(indexReader, termId("50")));
} }
@Test @Test
public void test2x2() throws IOException { public void test2x2() throws IOException {
@ -72,13 +79,13 @@ class FullReverseIndexReaderTest {
new EntryDataWithWordMeta(101, 101, wm(51, 53), wm(52, 54)) new EntryDataWithWordMeta(101, 101, wm(51, 53), wm(52, 54))
); );
assertEquals(1, indexReader.numDocuments(50)); assertEquals(1, indexReader.numDocuments(termId("50")));
assertEquals(2, indexReader.numDocuments(51)); assertEquals(2, indexReader.numDocuments(termId("51")));
assertEquals(1, indexReader.numDocuments(52)); assertEquals(1, indexReader.numDocuments(termId("52")));
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50)); assertArrayEquals(new long[] { 100 }, readEntries(indexReader, termId("50")));
assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, 51)); assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, termId("51")));
assertArrayEquals(new long[] { 101 }, readEntries(indexReader, 52)); assertArrayEquals(new long[] { 101 }, readEntries(indexReader, termId("52")));
} }
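
The tests above now address terms by keyword rather than by a raw numeric id; the id is derived by hashing the keyword with the MurmurHash3_128 helper introduced in this test class. A one-line sketch:

    MurmurHash3_128 hash = new MurmurHash3_128();

    long termId = hash.hashKeyword("50");   // same id the index writer derives for the keyword "50"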

View File

@ -1,5 +1,6 @@
package nu.marginalia.index.construction.full; package nu.marginalia.index.construction.full;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor; import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
@ -53,33 +54,9 @@ class FullPreindexDocsTest {
Files.delete(tempDir); Files.delete(tempDir);
} }
@Test MurmurHash3_128 hash = new MurmurHash3_128();
public void testDocs() throws IOException { long termId(String keyword) {
var reader = journalFactory.createReader( return hash.hashKeyword(keyword);
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), new PositionsFileConstructor(positionsFile), segments);
List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }),
new TestSegmentData(10, 2, 4, new long[] { -0xF00BA3L, 0 }),
new TestSegmentData(33, 4, 6, new long[] { -0xF00BA3L, 0 }),
new TestSegmentData(40, 6, 8, new long[] { -0xF00BA3L, 0 })
);
List<TestSegmentData> actual = new ArrayList<>();
var iter = segments.iterator(2);
while (iter.next()) {
long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
docs.slice(iter.startOffset, iter.endOffset).get(0, data);
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
data));
}
assertEquals(expected, actual);
} }
@Test @Test
@ -94,7 +71,7 @@ class FullPreindexDocsTest {
segments); segments);
List<TestSegmentData> expected = List.of( List<TestSegmentData> expected = List.of(
new TestSegmentData(4, 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 }) new TestSegmentData(termId("4"), 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 })
); );
List<TestSegmentData> actual = new ArrayList<>(); List<TestSegmentData> actual = new ArrayList<>();

View File

@ -3,6 +3,7 @@ package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.model.BTreeHeader; import nu.marginalia.btree.model.BTreeHeader;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor; import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
@ -12,9 +13,11 @@ import org.junit.jupiter.api.Test;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.*; import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.index.construction.full.TestJournalFactory.*; import static nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.assertTrue;
@ -51,6 +54,11 @@ class FullPreindexFinalizeTest {
Files.delete(tempDir); Files.delete(tempDir);
} }
MurmurHash3_128 hash = new MurmurHash3_128();
long termId(String keyword) {
return hash.hashKeyword(keyword);
}
@Test @Test
public void testFinalizeSimple() throws IOException { public void testFinalizeSimple() throws IOException {
var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51))); var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
@ -81,7 +89,7 @@ class FullPreindexFinalizeTest {
assertEquals(1, wordsHeader.numEntries()); assertEquals(1, wordsHeader.numEntries());
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0)); assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs())); assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
} }
@ -121,8 +129,8 @@ class FullPreindexFinalizeTest {
long offset1 = wordsArray.get(wordsHeader.dataOffsetLongs() + 1); long offset1 = wordsArray.get(wordsHeader.dataOffsetLongs() + 1);
long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3); long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3);
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs())); assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs())); assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
BTreeHeader docsHeader; BTreeHeader docsHeader;

View File

@ -1,435 +0,0 @@
package nu.marginalia.index.construction.full;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import static nu.marginalia.index.construction.full.TestJournalFactory.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
class FullPreindexMergeTest {
TestJournalFactory journalFactory;
Path countsFile;
Path wordsIdFile;
Path docsFile;
Path tempDir;
Path positionsFile;
@BeforeEach
public void setUp() throws IOException {
journalFactory = new TestJournalFactory();
positionsFile = Files.createTempFile("positions", ".dat");
countsFile = Files.createTempFile("counts", ".dat");
wordsIdFile = Files.createTempFile("words", ".dat");
docsFile = Files.createTempFile("docs", ".dat");
tempDir = Files.createTempDirectory("sort");
}
@AfterEach
public void tearDown() throws IOException {
journalFactory.clear();
Files.deleteIfExists(countsFile);
Files.deleteIfExists(wordsIdFile);
List<Path> contents = new ArrayList<>();
Files.list(tempDir).forEach(contents::add);
for (var tempFile : contents) {
Files.delete(tempFile);
}
Files.delete(tempDir);
}
public FullPreindex runMergeScenario(
List<EntryDataWithWordMeta> leftData,
List<EntryDataWithWordMeta> rightData
) throws IOException {
var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new));
var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new));
var left = FullPreindex.constructPreindex(reader1, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
var right = FullPreindex.constructPreindex(reader2, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
return FullPreindex.merge(tempDir, left, right);
}
private List<TestSegmentData> getData(FullPreindex merged) {
var iter = merged.segments.iterator(2);
List<TestSegmentData> actual = new ArrayList<>();
while (iter.next()) {
long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
merged.documents.slice(iter.startOffset, iter.endOffset).get(0, data);
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
data));
}
return actual;
}
@Test
@Disabled
public void testDocsMergeSingleNoOverlap() throws IOException {
IdSequence docIds = new IdSequence();
IdSequence docMetas = new IdSequence();
IdSequence wordMetas = new IdSequence();
IdSequence wordIds = new IdSequence();
var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
var merged = runMergeScenario(
leftSequence,
rightSequence
);
var actual = getData(merged);
var expected = simulateMerge(leftSequence, rightSequence);
System.out.println(actual);
assertEquals(expected, actual);
}
@Test
@Disabled
public void testDocsMergeSingleOnlyOverlap() throws IOException {
IdSequence docIds = new IdSequence();
IdSequence docMetas = new IdSequence();
IdSequence wordMetas = new IdSequence();
IdSequence wordIds = new IdSequence();
var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.alreadySeenSameSequence(), wordMetas.nextUnique())));
var merged = runMergeScenario(
leftSequence,
rightSequence
);
var actual = getData(merged);
var expected = simulateMerge(leftSequence, rightSequence);
System.out.println(actual);
assertEquals(expected, actual);
}
@Test
@Disabled
public void testDocsMergeSingleOnlyOverlap2() throws IOException {
long wid1 = 1;
long wid2 = 2;
IdSequence docIds = new IdSequence();
IdSequence docMetas = new IdSequence();
IdSequence wordMetas = new IdSequence();
var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(),
wm(wid1, wordMetas.nextUnique()),
wm(wid2, wordMetas.nextUnique())
));
var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(),
wm(wid1, wordMetas.nextUnique()),
wm(wid2, wordMetas.nextUnique())
));
var merged = runMergeScenario(
leftSequence,
rightSequence
);
var actual = getData(merged);
var expected = simulateMerge(leftSequence, rightSequence);
System.out.println(actual);
assertEquals(expected, actual);
}
@Test
@Disabled
public void testBadCase1() throws IOException {
long wordId = 0xF00F00BA3L;
List<EntryDataWithWordMeta> leftSequence = List.of(new EntryDataWithWordMeta(40, 50,
wm(wordId, 5))
);
List<EntryDataWithWordMeta> rightSequence = List.of(new EntryDataWithWordMeta(41, 51,
wm(wordId, 3),
wm(wordId, 4))
);
var mergedLR = runMergeScenario(
leftSequence,
rightSequence
);
var mergedRL = runMergeScenario(
rightSequence,
leftSequence
);
var actualLR = getData(mergedLR);
var actualRL = getData(mergedRL);
var expected = simulateMerge(leftSequence, rightSequence);
assertEquals(actualLR, actualRL);
if (!expected.equals(actualLR)) {
System.out.println("*fail*");
System.out.println(leftSequence);
System.out.println(rightSequence);
}
else {
System.out.println("*pass*");
}
assertEquals(expected, actualLR);
}
@Test
@Disabled
public void testBadCase2() throws IOException {
long wordId = 100;
List<EntryDataWithWordMeta> leftSequence = List.of(
new EntryDataWithWordMeta(1, 50, wm(wordId, 5)),
new EntryDataWithWordMeta(2, 50, wm(wordId, 5))
);
List<EntryDataWithWordMeta> rightSequence = List.of(
new EntryDataWithWordMeta(3, 50, wm(wordId, 5))
);
var mergedLR = runMergeScenario(
leftSequence,
rightSequence
);
var mergedRL = runMergeScenario(
rightSequence,
leftSequence
);
var actualLR = getData(mergedLR);
var actualRL = getData(mergedRL);
var expected = simulateMerge(leftSequence, rightSequence);
assertEquals(actualLR, actualRL);
if (!expected.equals(actualLR)) {
System.out.println("*fail*");
System.out.println(leftSequence);
System.out.println(rightSequence);
}
else {
System.out.println("*pass*");
}
assertEquals(expected, actualLR);
}
@Test
@Disabled
public void testFuzz() throws IOException {
Random r = new Random();
int maxDocs = 150;
int maxWords = 160;
int nIters = 1000;
for (int i = 0; i < nIters; i++) {
int nLeft = 1 + r.nextInt(maxDocs);
int nRight = 1 + r.nextInt(maxDocs);
IdSequence docIdsLeft = new IdSequence();
IdSequence docIdsRight = new IdSequence();
IdSequence docMetas = new IdSequence();
IdSequence wordMetas = new IdSequence();
IdSequence wordIds = new IdSequence();
List<EntryDataWithWordMeta> leftSequence = new ArrayList<>(nLeft);
for (int j = 0; j < nLeft; j++) {
WordWithMeta[] words = new WordWithMeta[maxWords == 1 ? 1 : r.nextInt(1, maxWords)];
Arrays.setAll(words, idx -> {
long wordId = wordIds.seenWithP(1.0);
long wordMeta = wordMetas.nextUniqueAssociatedWithKey(wordId);
return wm(wordId, wordMeta);
});
long docId = docIdsLeft.nextUnique();
long docMeta = docMetas.nextUniqueAssociatedWithKey(docId);
leftSequence.add(new EntryDataWithWordMeta(docId, docMeta, words));
}
List<EntryDataWithWordMeta> rightSequence = new ArrayList<>(nLeft);
for (int j = 0; j < nRight; j++) {
WordWithMeta[] words = new WordWithMeta[maxWords == 1 ? 1 : r.nextInt(1, maxWords)];
Arrays.setAll(words, idx -> {
long wordId = wordIds.seenWithP(1.0);
long wordMeta = wordMetas.nextUniqueAssociatedWithKey(wordId);
return wm(wordId, wordMeta);
});
long docId = docIdsRight.seenWithP(docIdsLeft, 0.1);
long docMeta = docMetas.nextUniqueAssociatedWithKey(docId);
rightSequence.add(new EntryDataWithWordMeta(docId, docMeta, words));
}
var mergedLR = runMergeScenario(
leftSequence,
rightSequence
);
var mergedRL = runMergeScenario(
rightSequence,
leftSequence
);
var actualLR = getData(mergedLR);
var actualRL = getData(mergedRL);
var expected = simulateMerge(leftSequence, rightSequence);
assertEquals(actualLR, actualRL);
if (!expected.equals(actualLR)) {
System.out.println("*fail*");
System.out.println(leftSequence);
System.out.println(rightSequence);
}
else {
System.out.println("*pass*");
}
assertEquals(expected, actualLR);
}
}
public List<TestSegmentData> simulateMerge(
Collection<EntryDataWithWordMeta> leftInputs,
Collection<EntryDataWithWordMeta> rightInputs
) {
TreeMap<Long, List<DocWithMeta>> wordToDocs = new TreeMap<>();
for (var entry : leftInputs) {
for (var wm : entry.wordIds()) {
wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add(
new DocWithMeta(entry.docId(), wm.meta())
);
}
}
for (var entry : rightInputs) {
for (var wm : entry.wordIds()) {
wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add(
new DocWithMeta(entry.docId(), wm.meta())
);
}
}
List<TestSegmentData> ret = new ArrayList<>();
int[] start = new int[1];
wordToDocs.forEach((wordId, docsList) -> {
docsList.sort(Comparator.naturalOrder());
var iter = docsList.iterator();
DocWithMeta prevVal = null;
DocWithMeta currentVal;
while (iter.hasNext()) {
currentVal = iter.next();
if (prevVal != null) {
if (currentVal.docId == prevVal.docId) {
iter.remove();
}
}
prevVal = currentVal;
}
long[] data = new long[docsList.size()*2];
for (int i = 0; i < docsList.size(); i++) {
data[2*i] = docsList.get(i).docId;
data[2*i + 1] = docsList.get(i).meta;
}
ret.add(new TestSegmentData(wordId, start[0], start[0] + data.length, data));
start[0] += data.length;
});
return ret;
}
record DocWithMeta(long docId, long meta) implements Comparable<DocWithMeta> {
@Override
public int compareTo(DocWithMeta o) {
return Long.compare(docId, o.docId);
}
}
class IdSequence {
Set<Long> seen = new HashSet<>();
Map<Long, Long> associatedValues = new HashMap<>();
private Random random = new Random();
/** Return alreadySeen() with probability p,
* else nextUnique()
*/
public long seenWithP(double p) {
if (isEmpty() || random.nextDouble() > p)
return nextUnique();
return alreadySeenSameSequence();
}
public long seenWithP(IdSequence other, double p) {
if (isEmpty() || random.nextDouble() > p)
return nextUnique();
return alreadySeenOtherSequence(other);
}
public long nextUnique() {
for (;;) {
long val = random.nextLong();
if (seen.add(val)) {
return val;
}
}
}
public long nextUniqueAssociatedWithKey(long key) {
return associatedValues.computeIfAbsent(key, k -> nextUnique());
}
public long alreadySeenSameSequence() {
long[] values = seen.stream().mapToLong(Long::longValue).toArray();
int idx = random.nextInt(0, values.length);
return values[idx];
}
public long alreadySeenOtherSequence(IdSequence other) {
List<Long> values = new ArrayList<>(other.seen);
Collections.shuffle(values);
for (Long maybe : values) {
if (seen.add(maybe))
return maybe;
}
return nextUnique();
}
public boolean isEmpty() {
return seen.isEmpty();
}
}
}

View File

@ -1,231 +0,0 @@
package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArray;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.index.construction.full.TestJournalFactory.*;
import static org.junit.jupiter.api.Assertions.*;
class FullPreindexWordSegmentsTest {
Path countsFile;
Path wordsIdFile;
Path docsFile;
Path tempDir;
TestJournalFactory journalFactory;
@BeforeEach
public void setUp() throws IOException {
journalFactory = new TestJournalFactory();
countsFile = Files.createTempFile("counts", ".dat");
wordsIdFile = Files.createTempFile("words", ".dat");
docsFile = Files.createTempFile("docs", ".dat");
tempDir = Files.createTempDirectory("sort");
}
@AfterEach
public void tearDown() throws IOException {
journalFactory.clear();
Files.deleteIfExists(countsFile);
Files.deleteIfExists(wordsIdFile);
List<Path> contents = new ArrayList<>();
Files.list(tempDir).forEach(contents::add);
for (var tempFile : contents) {
Files.delete(tempFile);
}
Files.delete(tempDir);
}
@Test
public void testWordSegmentsLongWordId() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 1L<<33)
);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
new TestSegmentData(1L<<33, 0, 1)
);
List<TestSegmentData> actual = new ArrayList<>();
while (iter.next()) {
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
}
assertEquals(expected, actual);
}
@Test
public void testWordSegmentsRepeatedWordId() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 5, 5)
);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
new TestSegmentData(5, 0, 2)
);
List<TestSegmentData> actual = new ArrayList<>();
while (iter.next()) {
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
}
assertEquals(expected, actual);
}
@Test
public void testWordSegments1() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 1),
new TestSegmentData(10, 1, 2),
new TestSegmentData(33, 2, 3),
new TestSegmentData(40, 3, 4)
);
List<TestSegmentData> actual = new ArrayList<>();
while (iter.next()) {
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
}
assertEquals(expected, actual);
}
@Test
public void testWordSegments2() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33),
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 2),
new TestSegmentData(10, 2, 3),
new TestSegmentData(15, 3, 4),
new TestSegmentData(30, 4, 5),
new TestSegmentData(33, 5, 7),
new TestSegmentData(40, 7, 8)
);
List<TestSegmentData> actual = new ArrayList<>();
while (iter.next()) {
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
}
assertEquals(expected, actual);
}
@Test
public void testWordSegments_ReadIterator() {
LongArray wordsArray = LongArray.allocate(4);
LongArray countsArray = LongArray.allocate(4);
wordsArray.set(0, -1, -2, -3, -4);
countsArray.set(0, 2, 1, 3, 5);
var segments = new FullPreindexWordSegments(wordsArray, countsArray, null, null);
var ritr = segments.iterator(1);
assertTrue(ritr.hasMorePositions());
assertTrue(ritr.next());
assertTrue(ritr.isPositionBeforeEnd());
assertEquals(-1, ritr.wordId);
assertEquals(0, ritr.idx());
assertEquals(0, ritr.startOffset);
assertEquals(2, ritr.endOffset);
assertTrue(ritr.hasMorePositions());
assertTrue(ritr.next());
assertTrue(ritr.isPositionBeforeEnd());
assertEquals(-2, ritr.wordId);
assertEquals(1, ritr.idx());
assertEquals(2, ritr.startOffset);
assertEquals(3, ritr.endOffset);
assertTrue(ritr.hasMorePositions());
assertTrue(ritr.next());
assertTrue(ritr.isPositionBeforeEnd());
assertEquals(-3, ritr.wordId);
assertEquals(2, ritr.idx());
assertEquals(3, ritr.startOffset);
assertEquals(6, ritr.endOffset);
assertTrue(ritr.hasMorePositions());
assertTrue(ritr.next());
assertTrue(ritr.isPositionBeforeEnd());
assertEquals(-4, ritr.wordId);
assertEquals(3, ritr.idx());
assertEquals(6, ritr.startOffset);
assertEquals(11, ritr.endOffset);
assertFalse(ritr.hasMorePositions());
assertFalse(ritr.next());
assertFalse(ritr.isPositionBeforeEnd());
assertEquals(Long.MIN_VALUE, ritr.wordId);
}
@Test
public void testWordSegments_ConstructionIterator() {
LongArray wordsArray = LongArray.allocate(4);
LongArray countsArray = LongArray.allocate(4);
wordsArray.set(0, -1, -2, -3, -4);
var segments = new FullPreindexWordSegments(wordsArray, countsArray, null, null);
var citr = segments.constructionIterator(1);
assertEquals(-1, citr.wordId);
assertEquals(0, citr.idx());
assertTrue(citr.canPutMore());
assertTrue(citr.putNext(1));
assertEquals(1, countsArray.get(0));
assertEquals(-2, citr.wordId);
assertEquals(1, citr.idx());
assertTrue(citr.canPutMore());
assertTrue(citr.putNext(2));
assertEquals(2, countsArray.get(1));
assertEquals(-3, citr.wordId);
assertEquals(2, citr.idx());
assertTrue(citr.canPutMore());
assertTrue(citr.putNext(3));
assertEquals(3, countsArray.get(2));
assertEquals(-4, citr.wordId);
assertEquals(3, citr.idx());
assertTrue(citr.canPutMore());
assertFalse(citr.putNext(4));
assertEquals(4, countsArray.get(3));
assertEquals(4, citr.idx());
assertFalse(citr.canPutMore());
assertEquals(Long.MIN_VALUE, citr.wordId);
}
}

View File

@ -1,17 +1,15 @@
package nu.marginalia.index.construction.full; package nu.marginalia.index.construction.full;
import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader; import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.test.TestUtil;
import java.io.IOException; import java.io.IOException;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
@ -22,17 +20,13 @@ public class TestJournalFactory {
public TestJournalFactory() throws IOException {} public TestJournalFactory() throws IOException {}
public void clear() throws IOException { public void clear() throws IOException {
List<Path> toDelete = new ArrayList<>(); TestUtil.clearTempDir(tempDir);
try (var dirStream = Files.list(tempDir)) {
dirStream.forEach(toDelete::add);
}
for (var tempFile : toDelete) {
Files.delete(tempFile);
}
Files.delete(tempDir);
} }
public record EntryData(long docId, long docMeta, long... wordIds) { public record EntryData(long docId, long docMeta, String... wordIds) {
public EntryData(long docId, long docMeta, long... wordIds) {
this(docId, docMeta, Arrays.stream(wordIds).mapToObj(String::valueOf).toArray(String[]::new));
}
@Override @Override
public String toString() { public String toString() {
return "EntryData{" + return "EntryData{" +
@ -52,19 +46,23 @@ public class TestJournalFactory {
'}'; '}';
} }
} }
public record WordWithMeta(long wordId, long meta, GammaCodedSequence gcs) {} public record WordWithMeta(String wordId, byte meta, GammaCodedSequence gcs) {
public WordWithMeta(long wordId, byte meta, GammaCodedSequence gcs) {
public static WordWithMeta wm(long wordId, long meta, int... positions) { this(String.valueOf(wordId), meta, gcs);
return new WordWithMeta(wordId, meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions)); }
} }
public IndexJournalReader createReader(EntryData... entries) throws IOException { public static WordWithMeta wm(long wordId, int meta, int... positions) {
Path jf = Files.createTempFile(tempDir, "journal", ".dat"); return new WordWithMeta(wordId, (byte) meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions));
}
var writer = new IndexJournalWriterSingleFileImpl(jf); public IndexJournalPage createReader(EntryData... entries) throws IOException {
Path ji = Files.createTempDirectory(tempDir, "journal");
var writer = new IndexJournalSlopWriter(ji, 0);
for (var entry : entries) { for (var entry : entries) {
long[] termIds = new long[entry.wordIds.length]; String[] termIds = new String[entry.wordIds.length];
long[] meta = new long[entry.wordIds.length]; byte[] meta = new byte[entry.wordIds.length];
GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length]; GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length];
for (int i = 0; i < entry.wordIds.length; i++) { for (int i = 0; i < entry.wordIds.length; i++) {
@ -73,22 +71,35 @@ public class TestJournalFactory {
positions[i] = new GammaCodedSequence(new byte[1]); positions[i] = new GammaCodedSequence(new byte[1]);
} }
writer.put(new IndexJournalEntryHeader(entries.length, 0, 15, entry.docId, entry.docMeta), writer.put(
new IndexJournalEntryData(termIds, meta, positions)); entry.docId,
new SlopDocumentRecord.KeywordsProjection(
"test",
-1,
0,
entry.docMeta,
15,
Arrays.asList(termIds),
meta,
Arrays.asList(positions),
new byte[0],
List.of()
)
);
} }
writer.close(); writer.close();
var ret = new IndexJournalReaderSingleFile(jf);
return ret; return new IndexJournalPage(ji, 0);
} }
public IndexJournalReader createReader(EntryDataWithWordMeta... entries) throws IOException { public IndexJournalPage createReader(EntryDataWithWordMeta... entries) throws IOException {
Path jf = Files.createTempFile(tempDir, "journal", ".dat"); Path ji = Files.createTempDirectory(tempDir, "journal");
var writer = new IndexJournalWriterSingleFileImpl(jf); var writer = new IndexJournalSlopWriter(ji, 0);
for (var entry : entries) { for (var entry : entries) {
long[] termIds = new long[entry.wordIds.length]; String[] termIds = new String[entry.wordIds.length];
long[] meta = new long[entry.wordIds.length]; byte[] meta = new byte[entry.wordIds.length];
GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length]; GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length];
for (int i = 0; i < entry.wordIds.length; i++) { for (int i = 0; i < entry.wordIds.length; i++) {
termIds[i] = entry.wordIds[i].wordId; termIds[i] = entry.wordIds[i].wordId;
@ -96,11 +107,25 @@ public class TestJournalFactory {
positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, () -> new GammaCodedSequence(new byte[1])); positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, () -> new GammaCodedSequence(new byte[1]));
} }
writer.put(new IndexJournalEntryHeader(entries.length, 0, 15, entry.docId, entry.docMeta), writer.put(
new IndexJournalEntryData(termIds, meta, positions)); entry.docId,
new SlopDocumentRecord.KeywordsProjection(
"test",
-1,
0,
entry.docMeta,
15,
Arrays.asList(termIds),
meta,
Arrays.asList(positions),
new byte[0],
List.of()
)
);
} }
writer.close(); writer.close();
var ret = new IndexJournalReaderSingleFile(jf);
return ret; return new IndexJournalPage(ji, 0);
} }
} }
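
For readers unfamiliar with the new slop-backed journal, a hedged sketch of writing one test document the way this factory does. The KeywordsProjection arguments mirror the ones used above; the trailing new byte[0] and List.of() are presumably the span codes and span sequences this commit introduces, but their exact semantics are not spelled out in this diff:

    Path journalDir = Files.createTempDirectory("journal");
    var writer = new IndexJournalSlopWriter(journalDir, 0);   // page number 0

    String[] termIds = { "hello", "world" };
    byte[] meta = { 0, 0 };
    GammaCodedSequence[] positions = {
            GammaCodedSequence.generate(ByteBuffer.allocate(128), 1, 2),
            GammaCodedSequence.generate(ByteBuffer.allocate(128), 3)
    };

    writer.put(
            100L,   // document id
            new SlopDocumentRecord.KeywordsProjection(
                    "test", -1, 0, 0L, 15,
                    Arrays.asList(termIds),
                    meta,
                    Arrays.asList(positions),
                    new byte[0],     // presumably span codes (none here)
                    List.of()        // presumably span sequences (none here)
            )
    );

    writer.close();
    var page = new IndexJournalPage(journalDir, 0);   // reopen what was just written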

View File

@ -2,8 +2,8 @@ package nu.marginalia.index.construction.full;
import java.util.Arrays; import java.util.Arrays;
record TestSegmentData(long wordId, long start, long end, long[] data) { record TestSegmentData(String wordId, long start, long end, long[] data) {
public TestSegmentData(long wordId, long start, long end) { public TestSegmentData(String wordId, long start, long end) {
this(wordId, start, end, null); this(wordId, start, end, null);
} }
@ -22,7 +22,7 @@ record TestSegmentData(long wordId, long start, long end, long[] data) {
@Override @Override
public int hashCode() { public int hashCode() {
int result = (int) (wordId ^ (wordId >>> 32)); int result = wordId.hashCode();
result = 31 * result + (int) (start ^ (start >>> 32)); result = 31 * result + (int) (start ^ (start >>> 32));
result = 31 * result + (int) (end ^ (end >>> 32)); result = 31 * result + (int) (end ^ (end >>> 32));
result = 31 * result + Arrays.hashCode(data); result = 31 * result + Arrays.hashCode(data);

View File

@ -1,6 +1,7 @@
 package nu.marginalia.index.construction.prio;
 import nu.marginalia.array.page.LongQueryBuffer;
+import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.PrioReverseIndexReader;
 import nu.marginalia.index.construction.DocIdRewriter;
 import nu.marginalia.index.construction.full.TestJournalFactory;
@ -17,7 +18,7 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Random;
-import static nu.marginalia.index.construction.full.TestJournalFactory.*;
+import static nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
 import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
@ -59,6 +60,11 @@ class PrioPreindexTest {
         Files.delete(tempDir);
     }
+    MurmurHash3_128 hash = new MurmurHash3_128();
+
+    long termId(String keyword) {
+        return hash.hashKeyword(keyword);
+    }
     @Test
     public void testFinalizeSimple() throws IOException {
         var journalReader = journalFactory.createReader(
@ -79,7 +85,7 @@ class PrioPreindexTest {
         var indexReader = new PrioReverseIndexReader("test", wordsFile, docsFile);
-        var entrySource = indexReader.documents(50);
+        var entrySource = indexReader.documents(termId("50"));
         var lqb = new LongQueryBuffer(32);
         entrySource.read(lqb);
@ -139,10 +145,10 @@ class PrioPreindexTest {
         var indexReader = new PrioReverseIndexReader("test", wordsFile, docsFile);
-        int items = indexReader.numDocuments(50);
+        int items = indexReader.numDocuments(termId("50"));
         assertEquals(documentIds.length, items);
-        var entrySource = indexReader.documents(50);
+        var entrySource = indexReader.documents(termId("50"));
         var lqb = new LongQueryBuffer(32);
         for (int pos = 0; pos < documentIds.length;) {


@ -1,43 +0,0 @@
package nu.marginalia.test;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
public class TestUtil {
public static void clearTempDir(Path dir) {
if (Files.isDirectory(dir)) {
for (File f : dir.toFile().listFiles()) {
File[] files = f.listFiles();
if (files != null) {
Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir);
}
System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")");
f.delete();
}
}
System.out.println("Deleting " + dir);
dir.toFile().delete();
}
private static String fileSize(Path path) {
try {
long sizeBytes = Files.size(path);
if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb";
if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb";
if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb";
return sizeBytes + "b";
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
}
private static String round(double d) {
return String.format("%.2f", d);
}
}


@ -3,11 +3,11 @@ package nu.marginalia.index;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.IndexLocations;
+import nu.marginalia.index.forward.ForwardIndexFileNames;
+import nu.marginalia.index.forward.ForwardIndexReader;
 import nu.marginalia.index.index.CombinedIndexReader;
 import nu.marginalia.index.positions.PositionsFileReader;
 import nu.marginalia.storage.FileStorageService;
-import nu.marginalia.index.forward.ForwardIndexFileNames;
-import nu.marginalia.index.forward.ForwardIndexReader;
 import java.io.IOException;
 import java.nio.file.Files;
@ -56,7 +56,8 @@ public class IndexFactory {
     public ForwardIndexReader getForwardIndexReader() throws IOException {
         return new ForwardIndexReader(
                 ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.CURRENT),
-                ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.CURRENT)
+                ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.CURRENT),
+                ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.CURRENT)
         );
     }


@ -13,7 +13,9 @@ import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
 import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
 import nu.marginalia.api.searchquery.model.query.SearchSpecification;
-import nu.marginalia.api.searchquery.model.results.*;
+import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
+import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
+import nu.marginalia.api.searchquery.model.results.SearchResultSet;
 import nu.marginalia.array.page.LongQueryBuffer;
 import nu.marginalia.index.index.StatefulIndex;
 import nu.marginalia.index.model.SearchParameters;
@ -22,9 +24,9 @@ import nu.marginalia.index.query.IndexQuery;
 import nu.marginalia.index.query.IndexSearchBudget;
 import nu.marginalia.index.results.IndexResultRankingService;
 import nu.marginalia.index.results.model.ids.CombinedDocIdList;
+import nu.marginalia.index.searchset.SearchSet;
 import nu.marginalia.index.searchset.SearchSetsService;
 import nu.marginalia.index.searchset.SmallSearchSet;
-import nu.marginalia.index.searchset.SearchSet;
 import nu.marginalia.service.module.ServiceConfiguration;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -32,7 +34,8 @@ import org.slf4j.Marker;
 import org.slf4j.MarkerFactory;
 import java.sql.SQLException;
-import java.util.*;
+import java.util.BitSet;
+import java.util.List;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.Executor;
 import java.util.concurrent.Executors;
@ -142,7 +145,8 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
             for (var score : rawResult.keywordScores) {
                 rawItem.addKeywordScores(
                         RpcResultKeywordScore.newBuilder()
-                                .setEncodedWordMetadata(score.encodedWordMetadata())
+                                .setFlags(score.flags)
+                                .setPositions(score.positionCount)
                                 .setKeyword(score.keyword)
                 );
             }


@ -90,7 +90,7 @@ public class StatefulIndex {
         return combinedIndexReader != null;
     }
-    /** Stronger version of isAvailable() that also checks that the index is loaded */
+    /** Stronger page of isAvailable() that also checks that the index is loaded */
     public boolean isLoaded() {
         return combinedIndexReader != null && combinedIndexReader.isLoaded();
     }


@ -1,13 +1,16 @@
 package nu.marginalia.index.results;
-import nu.marginalia.api.searchquery.model.compiled.*;
+import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
+import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
+import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
 import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
 import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
 import nu.marginalia.api.searchquery.model.results.SearchResultItem;
 import nu.marginalia.index.index.CombinedIndexReader;
 import nu.marginalia.index.index.StatefulIndex;
-import nu.marginalia.index.model.SearchParameters;
 import nu.marginalia.index.model.QueryParams;
+import nu.marginalia.index.model.SearchParameters;
+import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.results.model.QuerySearchTerms;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
@ -15,13 +18,13 @@ import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.idx.WordFlags;
-import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.sequence.CodedSequence;
 import nu.marginalia.sequence.SequenceOperations;
 import javax.annotation.Nullable;
-import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*;
+import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
+import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate;
 /** This class is responsible for calculating the score of a search result.
  * It holds the data required to perform the scoring, as there is strong
@ -102,7 +105,7 @@ public class IndexResultScoreCalculator {
     }
     private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) {
-        boolean allSynthetic = booleanAggregate(wordFlagsQuery, WordFlags.Synthetic::isPresent);
+        boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags));
         int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask));
         int positionsCount = intMaxMinAggregate(countsQuery, p -> p);
@ -139,27 +142,27 @@ public class IndexResultScoreCalculator {
         }
         return booleanAggregate(queryGraphScores,
-                docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy()));
+                flags -> meetsQueryStrategyRequirements((byte) flags, queryParams.queryStrategy()));
     }
-    private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) {
+    private boolean meetsQueryStrategyRequirements(byte flags, QueryStrategy queryStrategy) {
         if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) {
-            return WordFlags.Site.isPresent(wordMeta);
+            return WordFlags.Site.isPresent(flags);
         }
         else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) {
-            return WordFlags.Subjects.isPresent(wordMeta);
+            return WordFlags.Subjects.isPresent(flags);
         }
         else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
-            return WordFlags.Title.isPresent(wordMeta);
+            return WordFlags.Title.isPresent(flags);
         }
         else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) {
-            return WordFlags.UrlPath.isPresent(wordMeta);
+            return WordFlags.UrlPath.isPresent(flags);
         }
         else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) {
-            return WordFlags.UrlDomain.isPresent(wordMeta);
+            return WordFlags.UrlDomain.isPresent(flags);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) {
-            return WordFlags.ExternalLink.isPresent(wordMeta);
+            return WordFlags.ExternalLink.isPresent(flags);
        }
        return true;
    }


@ -13,10 +13,8 @@ import nu.marginalia.index.forward.ForwardIndexConverter;
 import nu.marginalia.index.forward.ForwardIndexFileNames;
 import nu.marginalia.index.index.CombinedIndexReader;
 import nu.marginalia.index.index.StatefulIndex;
-import nu.marginalia.index.journal.model.IndexJournalEntryData;
-import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
-import nu.marginalia.index.journal.reader.IndexJournalReader;
-import nu.marginalia.index.journal.writer.IndexJournalWriter;
+import nu.marginalia.index.journal.IndexJournal;
+import nu.marginalia.index.journal.IndexJournalSlopWriter;
 import nu.marginalia.index.positions.TermData;
 import nu.marginalia.index.results.model.ids.CombinedDocIdList;
 import nu.marginalia.linkdb.docs.DocumentDbReader;
@ -27,9 +25,10 @@ import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.idx.WordFlags;
-import nu.marginalia.model.idx.WordMetadata;
+import nu.marginalia.model.processed.SlopDocumentRecord;
 import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.process.control.ProcessHeartbeat;
+import nu.marginalia.sequence.CodedSequence;
 import nu.marginalia.sequence.GammaCodedSequence;
 import nu.marginalia.service.server.Initialization;
 import nu.marginalia.storage.FileStorageService;
@ -63,7 +62,7 @@ public class CombinedIndexReaderTest {
     StatefulIndex statefulIndex;
     @Inject
-    IndexJournalWriter indexJournalWriter;
+    IndexJournalSlopWriter indexJournalWriter;
     @Inject
     FileStorageService fileStorageService;
@ -248,7 +247,6 @@ public class CombinedIndexReaderTest {
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,
-                IndexJournalReader::singleFile,
                 DocIdRewriter.identity(),
                 tmpDir);
         constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir);
@ -268,7 +266,6 @@ public class CombinedIndexReaderTest {
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,
-                IndexJournalReader::singleFile,
                 DocIdRewriter.identity(),
                 tmpDir);
@ -279,12 +276,14 @@ public class CombinedIndexReaderTest {
         Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
         Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
+        Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT);
         Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
         ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat,
-                IndexJournalReader.paging(workDir),
                 outputFileDocsId,
                 outputFileDocsData,
+                outputFileSpansData,
+                IndexJournal.findJournal(workDir).orElseThrow(),
                 domainRankings
         );
@ -318,19 +317,26 @@ public class CombinedIndexReaderTest {
             var meta = metaByDoc.get(doc);
-            var header = new IndexJournalEntryHeader(
-                    doc,
-                    meta.features,
-                    100,
-                    meta.documentMetadata.encode()
-            );
-            String[] keywords = words.stream().map(w -> w.keyword).toArray(String[]::new);
-            long[] metadata = words.stream().map(w -> w.termMetadata).mapToLong(Long::longValue).toArray();
-            var positions = words.stream().map(w -> w.positions).map(pos -> GammaCodedSequence.generate(ByteBuffer.allocate(1024), pos.toIntArray())).toArray(GammaCodedSequence[]::new);
-            indexJournalWriter.put(header,
-                    new IndexJournalEntryData(keywords, metadata, positions));
+            List<String> keywords = words.stream().map(w -> w.keyword).toList();
+            byte[] metadata = new byte[words.size()];
+            for (int i = 0; i < words.size(); i++) {
+                metadata[i] = words.get(i).termMetadata;
+            }
+            var positions = words.stream().map(w -> w.positions).map(pos -> (CodedSequence) GammaCodedSequence.generate(ByteBuffer.allocate(1024), pos.toIntArray())).toList();
+            indexJournalWriter.put(doc,
+                    new SlopDocumentRecord.KeywordsProjection(
+                            "",
+                            -1,
+                            meta.features,
+                            meta.documentMetadata.encode(),
+                            100,
+                            keywords,
+                            metadata,
+                            positions,
+                            new byte[0],
+                            List.of()
+                    ));
         });
         var linkdbWriter = new DocumentDbWriter(
@ -370,10 +376,10 @@ public class CombinedIndexReaderTest {
     }
     record MockDocumentMeta(int features, DocumentMetadata documentMetadata) {}
-    record MockDataKeyword(String keyword, long termMetadata, IntList positions) {}
+    record MockDataKeyword(String keyword, byte termMetadata, IntList positions) {}
     MockDataKeyword w(String keyword, WordFlags flags, int... positions) {
-        return new MockDataKeyword(keyword, new WordMetadata(0L, EnumSet.of(flags)).encode(), IntList.of(positions));
+        return new MockDataKeyword(keyword, flags.asBit(), IntList.of(positions));
    }
 }
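The SlopDocumentRecord.KeywordsProjection constructor used by the test above (and again in the integration tests below) takes its fields positionally. An annotated sketch of one such call follows; the per-argument comments are inferred from how the tests populate them, not from the record definition itself, so treat them as assumptions:

    indexJournalWriter.put(doc,
            new SlopDocumentRecord.KeywordsProjection(
                    "",                              // domain name, left blank by the tests (assumed meaning)
                    -1,                              // ordinal of the document within the domain (assumed meaning)
                    meta.features,                   // HTML feature bits
                    meta.documentMetadata.encode(),  // packed document metadata
                    100,                             // document length
                    keywords,                        // List<String> of terms
                    metadata,                        // byte[] of per-term WordFlags bits
                    positions,                       // List<CodedSequence> of term positions
                    new byte[0],                     // span codes, none in these tests
                    List.of()                        // span position sequences, none in these tests
            ));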


@ -4,23 +4,18 @@ import com.google.inject.Guice;
 import com.google.inject.Inject;
 import lombok.SneakyThrows;
 import nu.marginalia.IndexLocations;
-import nu.marginalia.api.searchquery.model.query.SearchSpecification;
 import nu.marginalia.api.searchquery.model.query.SearchQuery;
+import nu.marginalia.api.searchquery.model.query.SearchSpecification;
 import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
-import nu.marginalia.index.construction.prio.PrioIndexConstructor;
-import nu.marginalia.index.index.StatefulIndex;
-import nu.marginalia.index.journal.model.IndexJournalEntryData;
-import nu.marginalia.process.control.FakeProcessHeartbeat;
-import nu.marginalia.process.control.ProcessHeartbeat;
-import nu.marginalia.sequence.GammaCodedSequence;
-import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.index.construction.DocIdRewriter;
 import nu.marginalia.index.construction.full.FullIndexConstructor;
+import nu.marginalia.index.construction.prio.PrioIndexConstructor;
+import nu.marginalia.index.domainrankings.DomainRankings;
 import nu.marginalia.index.forward.ForwardIndexConverter;
 import nu.marginalia.index.forward.ForwardIndexFileNames;
-import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
-import nu.marginalia.index.journal.reader.IndexJournalReader;
-import nu.marginalia.index.journal.writer.IndexJournalWriter;
+import nu.marginalia.index.index.StatefulIndex;
+import nu.marginalia.index.journal.IndexJournal;
+import nu.marginalia.index.journal.IndexJournalSlopWriter;
 import nu.marginalia.index.query.limit.QueryLimits;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
@ -29,12 +24,16 @@ import nu.marginalia.linkdb.docs.DocumentDbWriter;
 import nu.marginalia.linkdb.model.DocdbUrlDetail;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.id.UrlIdCodec;
-import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
-import nu.marginalia.model.idx.WordMetadata;
-import nu.marginalia.index.domainrankings.DomainRankings;
+import nu.marginalia.model.idx.WordFlags;
+import nu.marginalia.model.processed.SlopDocumentRecord;
+import nu.marginalia.process.control.FakeProcessHeartbeat;
+import nu.marginalia.process.control.ProcessHeartbeat;
+import nu.marginalia.sequence.CodedSequence;
+import nu.marginalia.sequence.GammaCodedSequence;
 import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.service.server.Initialization;
+import nu.marginalia.storage.FileStorageService;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
@ -70,7 +69,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
     ServiceHeartbeat heartbeat;
     @Inject
-    IndexJournalWriter indexJournalWriter;
+    IndexJournalSlopWriter indexJournalWriter;
     @Inject
     FileStorageService fileStorageService;
@ -296,7 +295,6 @@ public class IndexQueryServiceIntegrationSmokeTest {
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,
-                IndexJournalReader::singleFile,
                 DocIdRewriter.identity(),
                 tmpDir);
@ -316,7 +314,6 @@ public class IndexQueryServiceIntegrationSmokeTest {
         var constructor = new PrioIndexConstructor(
                 outputFileDocs,
                 outputFileWords,
-                IndexJournalReader::singleFile,
                 DocIdRewriter.identity(),
                 tmpDir);
@ -327,12 +324,14 @@ public class IndexQueryServiceIntegrationSmokeTest {
         Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
         Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
+        Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT);
         Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
         ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat,
-                IndexJournalReader.paging(workDir),
                 outputFileDocsId,
                 outputFileDocsData,
+                outputFileSpansData,
+                IndexJournal.findJournal(workDir).orElseThrow(),
                 domainRankings
         );
@ -354,32 +353,44 @@ public class IndexQueryServiceIntegrationSmokeTest {
         long fullId = fullId(id);
-        var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
         ldbw.add(new DocdbUrlDetail(
                 fullId, new EdgeUrl("https://www.example.com/"+id),
                 "test", "test", 0., "HTML5", 0, null, 0, 10
         ));
-        String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
-        long[] metadata = new long[factors.length];
-        for (int i = 0; i < factors.length; i++) {
-            metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
-        }
-        GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
-        ByteBuffer wa = ByteBuffer.allocate(32);
-        for (int i = 0; i < factors.length; i++) {
-            positions[i] = GammaCodedSequence.generate(wa, factors);
-        }
-        indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
+        List<String> keywords = IntStream.of(factors).mapToObj(Integer::toString).toList();
+        byte[] metadata = new byte[factors.length];
+        for (int i = 0; i < factors.length; i++) {
+            metadata[i] = WordFlags.Title.asBit();
+        }
+        List<CodedSequence> positions = new ArrayList<>();
+        ByteBuffer wa = ByteBuffer.allocate(32);
+        for (int i = 0; i < factors.length; i++) {
+            positions.add(GammaCodedSequence.generate(wa, factors));
+        }
+        indexJournalWriter.put(fullId,
+                new SlopDocumentRecord.KeywordsProjection(
+                        "",
+                        -1,
+                        0,
+                        new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode(),
+                        100,
+                        keywords,
+                        metadata,
+                        positions,
+                        new byte[0],
+                        List.of()
+                ));
     }
     @SneakyThrows
     public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) {
         int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
         long fullId = UrlIdCodec.encodeId(domain, id);
-        var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, DocumentMetadata.defaultValue());
         ldbw.add(new DocdbUrlDetail(
                 fullId, new EdgeUrl("https://www.example.com/"+id),
@ -387,18 +398,33 @@ public class IndexQueryServiceIntegrationSmokeTest {
         ));
-        String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
-        long[] metadata = new long[factors.length];
-        for (int i = 0; i < factors.length; i++) {
-            metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
-        }
-        GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
-        ByteBuffer wa = ByteBuffer.allocate(16);
-        for (int i = 0; i < factors.length; i++) {
-            positions[i] = GammaCodedSequence.generate(wa, i + 1);
-        }
-        indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
+        List<String> keywords = IntStream.of(factors).mapToObj(Integer::toString).toList();
+        byte[] metadata = new byte[factors.length];
+        for (int i = 0; i < factors.length; i++) {
+            metadata[i] = WordFlags.Title.asBit();
+        }
+        List<CodedSequence> positions = new ArrayList<>();
+        ByteBuffer wa = ByteBuffer.allocate(32);
+        for (int i = 0; i < factors.length; i++) {
+            positions.add(GammaCodedSequence.generate(wa, i + 1));
+        }
+        indexJournalWriter.put(fullId,
+                new SlopDocumentRecord.KeywordsProjection(
+                        "",
+                        -1,
+                        0,
+                        new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode(),
+                        100,
+                        keywords,
+                        metadata,
+                        positions,
+                        new byte[0],
+                        List.of()
+                ));
    }
 }


@ -5,22 +5,19 @@ import com.google.inject.Inject;
 import it.unimi.dsi.fastutil.ints.IntList;
 import nu.marginalia.IndexLocations;
 import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
-import nu.marginalia.api.searchquery.model.query.SearchSpecification;
 import nu.marginalia.api.searchquery.model.query.SearchQuery;
+import nu.marginalia.api.searchquery.model.query.SearchSpecification;
 import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
-import nu.marginalia.index.construction.full.FullIndexConstructor;
-import nu.marginalia.index.construction.prio.PrioIndexConstructor;
-import nu.marginalia.index.index.StatefulIndex;
-import nu.marginalia.index.journal.model.IndexJournalEntryData;
-import nu.marginalia.sequence.GammaCodedSequence;
-import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.full.FullIndexConstructor;
+import nu.marginalia.index.construction.prio.PrioIndexConstructor;
+import nu.marginalia.index.domainrankings.DomainRankings;
 import nu.marginalia.index.forward.ForwardIndexConverter;
 import nu.marginalia.index.forward.ForwardIndexFileNames;
-import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
-import nu.marginalia.index.journal.reader.IndexJournalReader;
-import nu.marginalia.index.journal.writer.IndexJournalWriter;
+import nu.marginalia.index.index.StatefulIndex;
+import nu.marginalia.index.journal.IndexJournal;
+import nu.marginalia.index.journal.IndexJournalSlopWriter;
 import nu.marginalia.index.query.limit.QueryLimits;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
@ -33,12 +30,14 @@ import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.idx.WordFlags;
-import nu.marginalia.model.idx.WordMetadata;
+import nu.marginalia.model.processed.SlopDocumentRecord;
 import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.process.control.ProcessHeartbeat;
-import nu.marginalia.index.domainrankings.DomainRankings;
+import nu.marginalia.sequence.CodedSequence;
+import nu.marginalia.sequence.GammaCodedSequence;
 import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.service.server.Initialization;
+import nu.marginalia.storage.FileStorageService;
 import org.apache.logging.log4j.util.Strings;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
@ -76,7 +75,7 @@ public class IndexQueryServiceIntegrationTest {
     ServiceHeartbeat heartbeat;
     @Inject
-    IndexJournalWriter indexJournalWriter;
+    IndexJournalSlopWriter indexJournalWriter;
     @Inject
     FileStorageService fileStorageService;
@ -475,7 +474,6 @@ public class IndexQueryServiceIntegrationTest {
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,
-                IndexJournalReader::singleFile,
                 DocIdRewriter.identity(),
                 tmpDir);
         constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir);
@ -493,7 +491,6 @@ public class IndexQueryServiceIntegrationTest {
         var constructor = new PrioIndexConstructor(
                 outputFileDocs,
                 outputFileWords,
-                IndexJournalReader::singleFile,
                 DocIdRewriter.identity(),
                 tmpDir);
@ -504,12 +501,14 @@ public class IndexQueryServiceIntegrationTest {
         Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
         Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
+        Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT);
         Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
         ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat,
-                IndexJournalReader.paging(workDir),
                 outputFileDocsId,
                 outputFileDocsData,
+                outputFileSpansData,
+                IndexJournal.findJournal(workDir).orElseThrow(),
                 domainRankings
         );
@ -539,24 +538,32 @@ public class IndexQueryServiceIntegrationTest {
             var meta = metaByDoc.get(doc);
-            var header = new IndexJournalEntryHeader(
-                    doc,
-                    meta.features,
-                    100,
-                    meta.documentMetadata.encode()
-            );
-            String[] keywords = words.stream().map(w -> w.keyword).toArray(String[]::new);
-            long[] metadata = words.stream().map(w -> w.termMetadata).mapToLong(Long::longValue).toArray();
-            GammaCodedSequence[] positions = new GammaCodedSequence[words.size()]; // FIXME: positions?
-            ByteBuffer workBuffer = ByteBuffer.allocate(8192);
-            for (int i = 0; i < positions.length; i++) {
-                positions[i] = GammaCodedSequence.generate(workBuffer, words.get(i).positions);
-            }
-            indexJournalWriter.put(header,
-                    new IndexJournalEntryData(keywords, metadata, positions));
+            List<String> keywords = words.stream().map(w -> w.keyword).toList();
+            byte[] metadata = new byte[keywords.size()];
+            for (int i = 0; i < words.size(); i++) {
+                metadata[i] = (byte) words.get(i).termMetadata;
+            }
+            List<CodedSequence> positions = new ArrayList<>();
+            ByteBuffer workBuffer = ByteBuffer.allocate(8192);
+            for (int i = 0; i < words.size(); i++) {
+                positions.add(GammaCodedSequence.generate(workBuffer, words.get(i).positions));
+            }
+            indexJournalWriter.put(doc,
+                    new SlopDocumentRecord.KeywordsProjection(
+                            "",
+                            -1,
+                            meta.features,
+                            meta.documentMetadata.encode(),
+                            100,
+                            keywords,
+                            metadata,
+                            positions,
+                            new byte[0],
+                            List.of()
+                    ));
         });
         var linkdbWriter = new DocumentDbWriter(
@ -599,8 +606,8 @@ public class IndexQueryServiceIntegrationTest {
     record MockDataKeyword(String keyword, long termMetadata, IntList positions) {}
     public MockDataKeyword w(String keyword, EnumSet<WordFlags> wordFlags, int... positions) {
-        return new MockDataKeyword(keyword, new WordMetadata(0, wordFlags).encode(), IntList.of(positions));
+        return new MockDataKeyword(keyword, WordFlags.encode(wordFlags), IntList.of(positions));
    }
    public MockDataKeyword w(String keyword) { return new MockDataKeyword(keyword, 0L, IntList.of()); }
-    public MockDataKeyword w(String keyword, WordFlags flags) { return new MockDataKeyword(keyword, new WordMetadata(0L, EnumSet.of(flags)).encode(), IntList.of()); }
+    public MockDataKeyword w(String keyword, WordFlags flags) { return new MockDataKeyword(keyword, flags.asBit(), IntList.of()); }
 }
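The mock helpers above lean on the byte-packed word flags that replace the old 64-bit word metadata throughout this commit. A minimal round-trip sketch of that packing, using only the WordFlags helpers introduced here:

    byte flags = WordFlags.encode(EnumSet.of(WordFlags.Title, WordFlags.Site));
    assert WordFlags.Title.isPresent(flags);        // bit set by encode()
    assert WordFlags.ExternalLink.isAbsent(flags);  // untouched bits stay clear
    assert WordFlags.decode(flags).equals(EnumSet.of(WordFlags.Title, WordFlags.Site));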


@ -2,21 +2,23 @@ package nu.marginalia.index;
 import com.google.inject.AbstractModule;
 import nu.marginalia.IndexLocations;
+import nu.marginalia.index.domainrankings.DomainRankings;
+import nu.marginalia.index.journal.IndexJournal;
+import nu.marginalia.index.journal.IndexJournalSlopWriter;
 import nu.marginalia.index.searchset.SearchSetAny;
 import nu.marginalia.index.searchset.SearchSetsService;
-import nu.marginalia.index.util.TestUtil;
-import nu.marginalia.storage.FileStorageService;
-import nu.marginalia.storage.model.FileStorageBase;
-import nu.marginalia.storage.model.FileStorageBaseType;
-import nu.marginalia.index.journal.writer.IndexJournalWriter;
-import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
 import nu.marginalia.linkdb.docs.DocumentDbReader;
 import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.process.control.ProcessHeartbeat;
-import nu.marginalia.index.domainrankings.DomainRankings;
-import nu.marginalia.service.control.*;
 import nu.marginalia.service.ServiceId;
+import nu.marginalia.service.control.FakeServiceHeartbeat;
+import nu.marginalia.service.control.ServiceEventLog;
+import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.service.module.ServiceConfiguration;
+import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.storage.model.FileStorageBase;
+import nu.marginalia.storage.model.FileStorageBaseType;
+import nu.marginalia.test.TestUtil;
 import org.mockito.Mockito;
 import java.io.IOException;
@ -41,8 +43,10 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule {
         slowDir = workDir.resolve("slow");
         fastDir = workDir.resolve("fast");
         Files.createDirectory(slowDir);
         Files.createDirectory(fastDir);
+        Files.createDirectory(fastDir.resolve("iw"));
     }
     public void cleanUp() {
@ -75,9 +79,7 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule {
         bind(ServiceEventLog.class).toInstance(Mockito.mock(ServiceEventLog.class));
-        bind(IndexJournalWriter.class).toInstance(new IndexJournalWriterPagingImpl(
-                IndexLocations.getIndexConstructionArea(fileStorageServiceMock)
-        ));
+        bind(IndexJournalSlopWriter.class).toInstance(new IndexJournalSlopWriter(IndexJournal.allocateName(fastDir.resolve("iw")), 0));
         bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(
                 ServiceId.Index,


@ -1,44 +0,0 @@
package nu.marginalia.index.util;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
public class TestUtil {
public static void clearTempDir(Path path) {
if (Files.isDirectory(path)) {
for (File f : path.toFile().listFiles()) {
File[] files = f.listFiles();
if (files != null) {
Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir);
}
System.out.println("Deleting " + f);
f.delete();
}
}
System.out.println("Deleting " + path + " (" + fileSize(path) + ")");
path.toFile().delete();
}
private static String fileSize(Path path) {
try {
long sizeBytes = Files.size(path);
if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb";
if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb";
if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb";
return sizeBytes + "b";
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
}
private static String round(double d) {
return String.format("%.2f", d);
}
}


@ -26,6 +26,8 @@ dependencies {
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
+
+    testImplementation project(':code:libraries:test-helpers')
 }
 jmh {


@ -4,7 +4,7 @@ import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
 import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
-import nu.marginalia.util.test.TestUtil;
+import nu.marginalia.test.TestUtil;
 import org.apache.commons.lang3.ArrayUtils;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Tag;
@ -13,7 +13,7 @@ import org.junit.jupiter.api.Test;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.*;
+import java.util.Random;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;


@ -3,7 +3,7 @@ package nu.marginalia.array.algo;
 import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
-import nu.marginalia.util.test.TestUtil;
+import nu.marginalia.test.TestUtil;
 import org.apache.commons.lang3.ArrayUtils;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Tag;


@ -1,43 +0,0 @@
package nu.marginalia.util.test;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
public class TestUtil {
public static void clearTempDir(Path dir) {
if (Files.isDirectory(dir)) {
for (File f : dir.toFile().listFiles()) {
File[] files = f.listFiles();
if (files != null) {
Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir);
}
System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")");
f.delete();
}
}
System.out.println("Deleting " + dir);
dir.toFile().delete();
}
private static String fileSize(Path path) {
try {
long sizeBytes = Files.size(path);
if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb";
if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb";
if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb";
return sizeBytes + "b";
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
}
private static String round(double d) {
return String.format("%.2f", d);
}
}


@ -1,12 +1,11 @@
 package nu.marginalia.sequence;
-import blue.strategic.parquet.BinarySerializable;
 import it.unimi.dsi.fastutil.ints.IntIterator;
 import it.unimi.dsi.fastutil.ints.IntList;
 import java.nio.ByteBuffer;
-public interface CodedSequence extends BinarySerializable {
+public interface CodedSequence {
     byte[] bytes();
     IntIterator iterator();


@ -158,7 +158,7 @@ public class GammaCodedSequence implements Iterable<Integer>, CodedSequence {
             last = i;
             // can't encode zeroes
-            assert delta > 0 : "Sequence must be strictly increasing and may not contain zeroes or negative values";
+            assert delta > 0 : "Sequence must be strictly increasing and may not contain zeroes or negative values; was " + sequence;
             writer.putGamma(delta);
         }


@ -1,21 +1,24 @@
 package nu.marginalia.language.sentence.tag;
 public enum HtmlTag {
-    SCRIPT(true, false),
-    STYLE(true, false),
-    CODE(false, true),
-    PRE(false, true),
-    TITLE(false, false),
-    HEADING(false, false),
-    NAV(false, false),
-    HEADER(false, false),
-    FOOTER(false, false);
+    SCRIPT('s', true, false),
+    STYLE('S', true, false),
+    CODE('c', false, true),
+    PRE('p', false, true),
+    TITLE('t', false, false),
+    HEADING('h', false, false),
+    NAV('n', false, false),
+    HEADER('H', false, false),
+    FOOTER('f', false, false);
+    public char code;
     public boolean exclude;
     public boolean nonLanguage;
-    HtmlTag(boolean exclude, boolean nonLanguage) {
+    HtmlTag(char code, boolean exclude, boolean nonLanguage) {
+        this.code = code;
         this.exclude = exclude;
        this.nonLanguage = nonLanguage;
    }
 }
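Each tag now carries a single-character code that can be stored compactly alongside span data. A small reverse-lookup sketch, assuming the codes remain unique per tag; the fromCode helper is illustrative and not part of this commit:

    public static HtmlTag fromCode(char code) {
        // linear scan is fine; the enum only has a handful of constants
        for (HtmlTag tag : values()) {
            if (tag.code == code)
                return tag;
        }
        throw new IllegalArgumentException("Unknown HtmlTag code: " + code);
    }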


@ -15,6 +15,8 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
 dependencies {
     implementation libs.bundles.slf4j
+
+    implementation project(':code:libraries:coded-sequence')
     implementation libs.notnull
     implementation libs.commons.lang3
     implementation libs.fastutil
@ -22,6 +24,7 @@ dependencies {
     implementation libs.guava
     implementation libs.commons.compress
+
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
     testImplementation libs.mockito


@ -0,0 +1,121 @@
package nu.marginalia.slop.column.dynamic;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.slop.desc.ColumnDesc;
import nu.marginalia.slop.desc.ColumnFunction;
import nu.marginalia.slop.desc.ColumnType;
import nu.marginalia.slop.desc.StorageType;
import nu.marginalia.slop.storage.Storage;
import nu.marginalia.slop.storage.StorageReader;
import nu.marginalia.slop.storage.StorageWriter;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Path;
public class GammaCodedSequenceColumn {
public static GammaCodedSequenceReader open(Path path, ColumnDesc name) throws IOException {
return new Reader(
Storage.reader(path, name, false), // note we must never pass aligned=true here, as the data is not guaranteed alignment
VarintColumn.open(path, name.createDerivative(ColumnFunction.DATA_LEN,
ColumnType.VARINT_LE,
StorageType.PLAIN)
)
);
}
public static GammaCodedSequenceWriter create(Path path, ColumnDesc name) throws IOException {
return new Writer(
Storage.writer(path, name),
VarintColumn.create(path, name.createDerivative(ColumnFunction.DATA_LEN,
ColumnType.VARINT_LE,
StorageType.PLAIN)
)
);
}
private static class Writer implements GammaCodedSequenceWriter {
private final VarintColumnWriter indexWriter;
private final StorageWriter storage;
public Writer(StorageWriter storage,
VarintColumnWriter indexWriter)
{
this.storage = storage;
this.indexWriter = indexWriter;
}
@Override
public void put(GammaCodedSequence sequence) throws IOException {
var buffer = sequence.buffer();
int length = buffer.remaining();
indexWriter.put(length);
storage.putBytes(buffer);
}
public void close() throws IOException {
indexWriter.close();
storage.close();
}
}
private static class Reader implements GammaCodedSequenceReader {
private final VarintColumnReader indexReader;
private final StorageReader storage;
public Reader(StorageReader reader, VarintColumnReader indexReader) throws IOException {
this.storage = reader;
this.indexReader = indexReader;
}
@Override
public void skip(long positions) throws IOException {
for (int i = 0; i < positions; i++) {
int size = (int) indexReader.get();
storage.skip(size, 1);
}
}
@Override
public boolean hasRemaining() throws IOException {
return indexReader.hasRemaining();
}
public long position() throws IOException {
return indexReader.position();
}
@Override
public GammaCodedSequence get(ByteBuffer workArea) throws IOException {
int size = (int) indexReader.get();
workArea.clear();
workArea.limit(size);
storage.getBytes(workArea);
workArea.flip();
return new GammaCodedSequence(workArea);
}
@Override
public void getData(ByteBuffer workArea) throws IOException {
int size = (int) indexReader.get();
int oldLimit = workArea.limit();
workArea.limit(workArea.position() + size);
storage.getBytes(workArea);
workArea.limit(oldLimit);
}
public void close() throws IOException {
indexReader.close();
storage.close();
}
}
}


@ -0,0 +1,34 @@
package nu.marginalia.slop.column.dynamic;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.slop.column.ColumnReader;
import java.io.IOException;
import java.nio.ByteBuffer;
public interface GammaCodedSequenceReader extends AutoCloseable, ColumnReader {
/** Read the next gamma-coded sequence from the column. Unlike most other
* readers, this method requires an intermediate buffer to use for reading
* the sequence. As this buffer typically needs to be fairly large to accommodate
* the largest possible sequence, it is not practical to allocate a new buffer
* for each call to this method. Instead, the caller should allocate a buffer
* once and reuse it for each call to this method.
*
* @param workArea A buffer to use for reading the sequence.
* @return The next gamma-coded sequence.
*/
CodedSequence get(ByteBuffer workArea) throws IOException;
/** Read just the data portion of the next gamma-coded sequence from the column.
* This method is useful when the caller is only interested in the data portion
* of the sequence and does not want to decode the values.
*
* The position of the buffer is advanced to the end of the data that has just been read,
* and the limit remains the same.
*
* @param workArea A buffer to use for reading the data.
*/
void getData(ByteBuffer workArea) throws IOException;
void close() throws IOException;
}
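Since decoding goes through a caller-supplied buffer, a consumer would typically allocate one work area up front and reuse it for every row, as the javadoc above suggests. An illustrative read loop, assuming a reader opened via GammaCodedSequenceColumn.open and a buffer sized for the largest expected sequence:

    ByteBuffer workArea = ByteBuffer.allocate(65536); // size is an assumption, not a requirement of the API
    while (reader.hasRemaining()) {
        CodedSequence sequence = reader.get(workArea);
        var values = sequence.iterator();
        while (values.hasNext()) {
            int position = values.nextInt(); // consume the decoded position
        }
    }
    reader.close();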


@ -0,0 +1,11 @@
package nu.marginalia.slop.column.dynamic;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.slop.column.ColumnWriter;
import java.io.IOException;
public interface GammaCodedSequenceWriter extends AutoCloseable, ColumnWriter {
void put(GammaCodedSequence sequence) throws IOException;
void close() throws IOException;
}
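The write side simply appends already-encoded sequences. A short sketch, assuming a writer obtained from GammaCodedSequenceColumn.create and GammaCodedSequence.generate(ByteBuffer, int...) as used by the tests elsewhere in this commit:

    ByteBuffer workArea = ByteBuffer.allocate(1024);
    writer.put(GammaCodedSequence.generate(workArea, 1, 4, 9)); // values must be strictly increasing and non-zero
    writer.put(GammaCodedSequence.generate(workArea, 2, 3));
    writer.close();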


@ -47,6 +47,7 @@ public abstract class ColumnType<
     public static ColumnType<VarintColumnReader, VarintColumnWriter> VARINT_LE = register("varintle", ByteOrder.LITTLE_ENDIAN, VarintColumn::open, VarintColumn::create);
     public static ColumnType<VarintColumnReader, VarintColumnWriter> VARINT_BE = register("varintbe", ByteOrder.BIG_ENDIAN, VarintColumn::open, VarintColumn::create);
     public static ColumnType<CustomBinaryColumnReader, CustomBinaryColumnWriter> BYTE_ARRAY_CUSTOM = register("s8[]+custom", ByteOrder.nativeOrder(), CustomBinaryColumn::open, CustomBinaryColumn::create);
+    public static ColumnType<GammaCodedSequenceReader, GammaCodedSequenceWriter> BYTE_ARRAY_GCS = register("s8[]+gcs", ByteOrder.nativeOrder(), GammaCodedSequenceColumn::open, GammaCodedSequenceColumn::create);
     public static ColumnType<StringColumnReader, StringColumnWriter> STRING = register("s8[]+str", ByteOrder.nativeOrder(), StringColumn::open, StringColumn::create);
     public static ColumnType<StringColumnReader, StringColumnWriter> CSTRING = register("s8+cstr", ByteOrder.nativeOrder(), StringColumn::open, StringColumn::create);
     public static ColumnType<StringColumnReader, StringColumnWriter> TXTSTRING = register("s8+txt", ByteOrder.nativeOrder(), StringColumn::open, StringColumn::create);


@ -13,7 +13,9 @@ public class TestUtil {
             return;
         if (Files.isDirectory(path)) {
-            for (File f : path.toFile().listFiles()) {
+            var contents = path.toFile().listFiles();
+
+            for (File f : contents) {
                 if (f.isDirectory()) {
                     File[] files = f.listFiles();
                     if (files != null) {


@ -1,32 +0,0 @@
plugins {
id 'java'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation libs.bundles.slf4j
implementation project(':third-party:parquet-floor')
implementation project(':code:common:config')
implementation project(':code:common:db')
implementation project(':code:common:linkdb')
implementation libs.notnull
implementation libs.trove
implementation libs.bundles.parquet
implementation libs.bundles.mariadb
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}


@ -1,16 +0,0 @@
# Crawl Spec
A crawl spec is a list of domains to be crawled. It is a parquet file with the following columns:
- `domain`: The domain to be crawled
- `crawlDepth`: The depth to which the domain should be crawled
- `urls`: A list of known URLs to be crawled
Crawl specs are used to define the scope of a crawl in the absence of known domains.
The [CrawlSpecRecord](java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java) class is
used to represent a record in the crawl spec.
The [CrawlSpecRecordParquetFileReader](java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java)
and [CrawlSpecRecordParquetFileWriter](java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java)
classes are used to read and write the crawl spec parquet files.


@ -1,37 +0,0 @@
package nu.marginalia.io.processed;
import blue.strategic.parquet.HydratorSupplier;
import blue.strategic.parquet.ParquetReader;
import nu.marginalia.model.processed.DocumentRecord;
import nu.marginalia.model.processed.DocumentRecordKeywordsProjection;
import nu.marginalia.model.processed.DocumentRecordMetadataProjection;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.nio.file.Path;
import java.util.stream.Stream;
public class DocumentRecordParquetFileReader {
@NotNull
public static Stream<DocumentRecord> stream(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DocumentRecord.newHydrator()));
}
@NotNull
public static Stream<DocumentRecordKeywordsProjection> streamKeywordsProjection(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DocumentRecordKeywordsProjection.newHydrator()),
DocumentRecordKeywordsProjection.requiredColumns()
);
}
@NotNull
public static Stream<DocumentRecordMetadataProjection> streamMetadataProjection(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DocumentRecordMetadataProjection.newHydrator()),
DocumentRecordMetadataProjection.requiredColumns()
);
}
}
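
For context, a short usage sketch of the reader being deleted here, built only from the signatures shown above. The input path is a placeholder, and the stream is closed via try-with-resources to release the underlying parquet reader:

import nu.marginalia.io.processed.DocumentRecordParquetFileReader;
import nu.marginalia.model.processed.DocumentRecordKeywordsProjection;
import java.io.IOException;
import java.nio.file.Path;
import java.util.stream.Stream;

class KeywordsProjectionExample {
    public static void main(String[] args) throws IOException {
        Path input = Path.of("documents.parquet"); // placeholder path

        // Stream only the keyword-related columns rather than hydrating full records
        try (Stream<DocumentRecordKeywordsProjection> keywords =
                     DocumentRecordParquetFileReader.streamKeywordsProjection(input)) {
            keywords.forEach(System.out::println);
        }
    }
}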

View File

@ -1,24 +0,0 @@
package nu.marginalia.io.processed;
import blue.strategic.parquet.ParquetWriter;
import nu.marginalia.model.processed.DocumentRecord;
import java.io.IOException;
import java.nio.file.Path;
public class DocumentRecordParquetFileWriter implements AutoCloseable {
private final ParquetWriter<DocumentRecord> writer;
public DocumentRecordParquetFileWriter(Path file) throws IOException {
writer = ParquetWriter.writeFile(DocumentRecord.schema,
file.toFile(), DocumentRecord.newDehydrator());
}
public void write(DocumentRecord documentRecord) throws IOException {
writer.write(documentRecord);
}
public void close() throws IOException {
writer.close();
}
}
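
And a matching sketch pairing this writer with the reader above to copy records between files; both paths are placeholders, and only the methods shown in this diff are used:

import nu.marginalia.io.processed.DocumentRecordParquetFileReader;
import nu.marginalia.io.processed.DocumentRecordParquetFileWriter;
import nu.marginalia.model.processed.DocumentRecord;
import java.io.IOException;
import java.nio.file.Path;
import java.util.stream.Stream;

class DocumentRecordCopyExample {
    public static void main(String[] args) throws IOException {
        Path in = Path.of("documents-in.parquet");   // placeholder
        Path out = Path.of("documents-out.parquet"); // placeholder

        try (Stream<DocumentRecord> records = DocumentRecordParquetFileReader.stream(in);
             DocumentRecordParquetFileWriter writer = new DocumentRecordParquetFileWriter(out)) {
            records.forEach(record -> {
                try {
                    writer.write(record); // write throws IOException, so rethrow unchecked inside the lambda
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            });
        }
    }
}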

View File

@ -1,30 +0,0 @@
package nu.marginalia.io.processed;
import blue.strategic.parquet.HydratorSupplier;
import blue.strategic.parquet.ParquetReader;
import nu.marginalia.model.processed.DomainLinkRecord;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.nio.file.Path;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class DomainLinkRecordParquetFileReader {
@NotNull
public static Stream<DomainLinkRecord> stream(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DomainLinkRecord.newHydrator()));
}
@NotNull
public static Set<String> getDestDomainNames(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DomainLinkRecord.newDestDomainHydrator()),
List.of("dest"))
.collect(Collectors.toSet());
}
}
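
A brief usage sketch of the destination-domain projection this reader exposed; the path is a placeholder:

import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Set;

class DestDomainsExample {
    public static void main(String[] args) throws IOException {
        // Reads only the "dest" column and collects the distinct destination domain names
        Set<String> destinations =
                DomainLinkRecordParquetFileReader.getDestDomainNames(Path.of("domain-links.parquet"));
        System.out.println(destinations.size() + " distinct destination domains");
    }
}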

View File

@ -1,24 +0,0 @@
package nu.marginalia.io.processed;
import blue.strategic.parquet.ParquetWriter;
import nu.marginalia.model.processed.DomainLinkRecord;
import java.io.IOException;
import java.nio.file.Path;
public class DomainLinkRecordParquetFileWriter implements AutoCloseable {
private final ParquetWriter<DomainLinkRecord> writer;
public DomainLinkRecordParquetFileWriter(Path file) throws IOException {
writer = ParquetWriter.writeFile(DomainLinkRecord.schema,
file.toFile(), DomainLinkRecord.newDehydrator());
}
public void write(DomainLinkRecord domainData) throws IOException {
writer.write(domainData);
}
public void close() throws IOException {
writer.close();
}
}

View File

@ -1,31 +0,0 @@
package nu.marginalia.io.processed;
import blue.strategic.parquet.HydratorSupplier;
import blue.strategic.parquet.ParquetReader;
import nu.marginalia.model.processed.DomainRecord;
import nu.marginalia.model.processed.DomainWithIp;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Stream;
public class DomainRecordParquetFileReader {
@NotNull
public static Stream<DomainRecord> stream(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DomainRecord.newHydrator()));
}
@NotNull
public static List<DomainWithIp> getBasicDomainInformation(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DomainRecord.newDomainNameHydrator()),
List.of("domain", "ip"))
.toList();
}
}

View File

@ -1,24 +0,0 @@
package nu.marginalia.io.processed;
import blue.strategic.parquet.ParquetWriter;
import nu.marginalia.model.processed.DomainRecord;
import java.io.IOException;
import java.nio.file.Path;
public class DomainRecordParquetFileWriter implements AutoCloseable {
private final ParquetWriter<DomainRecord> writer;
public DomainRecordParquetFileWriter(Path file) throws IOException {
writer = ParquetWriter.writeFile(DomainRecord.schema,
file.toFile(), DomainRecord.newDehydrator());
}
public void write(DomainRecord domainData) throws IOException {
writer.write(domainData);
}
public void close() throws IOException {
writer.close();
}
}

Some files were not shown because too many files have changed in this diff