package nu.marginalia.loading;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import lombok.SneakyThrows;
import nu.marginalia.IndexLocations;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.keyword.model.DocumentKeywords;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Files;
import java.sql.SQLException;

import static nu.marginalia.index.journal.model.IndexJournalEntryData.MAX_LENGTH;

@Singleton
|
|
|
|
public class LoaderIndexJournalWriter {
|
|
|
|
|
|
|
|
private final IndexJournalWriter indexWriter;
|
|
|
|
private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class);
|
|
|
|
|
2023-09-01 11:52:00 +00:00
|
|
|
private final MurmurHash3_128 hasher = new MurmurHash3_128();
|
|
|
|
private final long[] buffer = new long[MAX_LENGTH * 2];
|
|
|
|
|
|
|
|
|
2023-03-04 12:19:01 +00:00
|
|
|
@Inject
|
2023-07-14 15:08:10 +00:00
|
|
|
public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException, SQLException {
|
2023-10-14 10:07:40 +00:00
|
|
|
var indexArea = IndexLocations.getIndexConstructionArea(fileStorageService);
|
2023-03-04 12:19:01 +00:00
|
|
|
|
2023-10-14 10:07:40 +00:00
|
|
|
var existingIndexFiles = IndexJournalFileNames.findJournalFiles(indexArea);
|
2023-08-28 10:58:18 +00:00
|
|
|
for (var existingFile : existingIndexFiles) {
|
|
|
|
Files.delete(existingFile);
|
|
|
|
}
|
2023-07-17 19:20:31 +00:00
|
|
|
|
2023-10-14 10:07:40 +00:00
|
|
|
indexWriter = new IndexJournalWriterPagingImpl(indexArea);
|
2023-03-04 12:19:01 +00:00
|
|
|
}
|
|
|
|
|
2023-08-24 09:55:58 +00:00
|
|
|
public void putWords(long combinedId,
|
2023-08-18 09:54:56 +00:00
|
|
|
int features,
|
2023-03-06 17:32:13 +00:00
|
|
|
DocumentMetadata metadata,
|
2023-03-04 12:19:01 +00:00
|
|
|
DocumentKeywords wordSet) {
|
2023-09-01 11:52:00 +00:00
|
|
|
|
2023-09-13 14:13:41 +00:00
|
|
|
putWords(combinedId, features, metadata.encode(), wordSet);
|
|
|
|
}
|
|
|
|
|
|
|
|
@SneakyThrows
|
|
|
|
public void putWords(long combinedId,
|
|
|
|
int features,
|
|
|
|
long metadata,
|
|
|
|
DocumentKeywords wordSet) {
|
|
|
|
|
2023-09-01 11:52:00 +00:00
|
|
|
if (wordSet.isEmpty()) {
|
2023-08-24 09:55:58 +00:00
|
|
|
logger.info("Skipping zero-length word set for {}", combinedId);
|
2023-03-04 12:19:01 +00:00
|
|
|
return;
|
2023-08-07 10:57:38 +00:00
|
|
|
}
|
2023-03-04 12:19:01 +00:00
|
|
|
|
2023-08-24 09:55:58 +00:00
|
|
|
if (combinedId <= 0) {
|
|
|
|
logger.warn("Bad ID: {}", combinedId);
|
2023-03-04 12:19:01 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2023-09-01 11:52:00 +00:00
|
|
|
var pointer = wordSet.newPointer();
|
2023-03-04 12:19:01 +00:00
|
|
|
|
2023-09-01 11:52:00 +00:00
|
|
|
while (pointer.hasMore()) {
|
|
|
|
int i = 0;
|
2023-03-04 12:19:01 +00:00
|
|
|
|
2023-09-01 11:52:00 +00:00
|
|
|
while (i < buffer.length
|
|
|
|
&& pointer.advancePointer())
|
|
|
|
{
|
2024-01-31 10:50:59 +00:00
|
|
|
final long hashedKeyword = hasher.hashKeyword(pointer.getKeyword());
|
2023-09-01 11:52:00 +00:00
|
|
|
|
|
|
|
buffer[i++] = hashedKeyword;
|
|
|
|
buffer[i++] = pointer.getMetadata();
|
2023-08-28 10:58:18 +00:00
|
|
|
}
|
2023-03-04 12:19:01 +00:00
|
|
|
|
2023-09-01 11:52:00 +00:00
|
|
|
var entry = new IndexJournalEntryData(i, buffer);
|
2023-09-13 14:13:41 +00:00
|
|
|
var header = new IndexJournalEntryHeader(combinedId, features, metadata);
|
2023-03-04 12:19:01 +00:00
|
|
|
|
2023-08-28 10:58:18 +00:00
|
|
|
indexWriter.put(header, entry);
|
2023-03-04 12:19:01 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
public void close() throws Exception {
|
|
|
|
indexWriter.close();
|
|
|
|
}
|
|
|
|
}
|