MarginaliaSearch/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java

97 lines
3.0 KiB
Java
Raw Normal View History

package nu.marginalia.loading;
2023-03-04 12:19:01 +00:00
import com.google.inject.Inject;
import com.google.inject.Singleton;
import lombok.SneakyThrows;
import nu.marginalia.IndexLocations;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
2023-03-04 12:19:01 +00:00
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
2023-03-04 12:19:01 +00:00
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.keyword.model.DocumentKeywords;
2023-03-06 17:32:13 +00:00
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.index.journal.IndexJournalFileNames;
2023-03-04 12:19:01 +00:00
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
2023-07-14 15:08:10 +00:00
import java.sql.SQLException;
import static nu.marginalia.index.journal.model.IndexJournalEntryData.MAX_LENGTH;
2023-03-04 12:19:01 +00:00
@Singleton
public class LoaderIndexJournalWriter {
private final IndexJournalWriter indexWriter;
private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class);
private final MurmurHash3_128 hasher = new MurmurHash3_128();
private final long[] buffer = new long[MAX_LENGTH * 2];
2023-03-04 12:19:01 +00:00
@Inject
2023-07-14 15:08:10 +00:00
public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException, SQLException {
var indexArea = IndexLocations.getIndexConstructionArea(fileStorageService);
2023-03-04 12:19:01 +00:00
var existingIndexFiles = IndexJournalFileNames.findJournalFiles(indexArea);
for (var existingFile : existingIndexFiles) {
Files.delete(existingFile);
}
indexWriter = new IndexJournalWriterPagingImpl(indexArea);
2023-03-04 12:19:01 +00:00
}
public void putWords(long combinedId,
int features,
2023-03-06 17:32:13 +00:00
DocumentMetadata metadata,
2023-03-04 12:19:01 +00:00
DocumentKeywords wordSet) {
putWords(combinedId, features, metadata.encode(), wordSet);
}
@SneakyThrows
public void putWords(long combinedId,
int features,
long metadata,
DocumentKeywords wordSet) {
if (wordSet.isEmpty()) {
logger.info("Skipping zero-length word set for {}", combinedId);
2023-03-04 12:19:01 +00:00
return;
2023-08-07 10:57:38 +00:00
}
2023-03-04 12:19:01 +00:00
if (combinedId <= 0) {
logger.warn("Bad ID: {}", combinedId);
2023-03-04 12:19:01 +00:00
return;
}
var pointer = wordSet.newPointer();
2023-03-04 12:19:01 +00:00
while (pointer.hasMore()) {
int i = 0;
2023-03-04 12:19:01 +00:00
while (i < buffer.length
&& pointer.advancePointer())
{
final long hashedKeyword = hasher.hashKeyword(pointer.getKeyword());
buffer[i++] = hashedKeyword;
buffer[i++] = pointer.getMetadata();
}
2023-03-04 12:19:01 +00:00
var entry = new IndexJournalEntryData(i, buffer);
var header = new IndexJournalEntryHeader(combinedId, features, metadata);
2023-03-04 12:19:01 +00:00
indexWriter.put(header, entry);
2023-03-04 12:19:01 +00:00
}
}
public void close() throws Exception {
indexWriter.close();
}
}