Merge pull request 'Fix buggy madvise code, clean up preconverter' (#39) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/39
This commit is contained in:
Viktor Lofgren 2022-07-26 14:12:42 +02:00
commit 22abcc921e
4 changed files with 18 additions and 27 deletions

View File

@ -124,14 +124,22 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
if (endLongs >= mappedSize)
grow(endLongs);
var buff = mappedByteBuffers.get((int)(startLongs / bufferSize));
if ((int)(startLongs / bufferSize) != (int)((endLongs) / bufferSize)) {
logger.debug("Misaligned madvise, skipping");
return;
int startIdx = (int)(startLongs / bufferSize);
int endIdx = (int)(endLongs / bufferSize);
if (startIdx != endIdx) {
long offsetStart = (startLongs % bufferSize) * WORD_SIZE;
NativeIO.madviseRange(mappedByteBuffers.get(startIdx), advice, offsetStart, (int) (bufferSize * WORD_SIZE - offsetStart));
for (int i = startIdx+1; i < endIdx; i++) {
NativeIO.madviseRange(mappedByteBuffers.get(i), advice, 0, (int)(bufferSize * WORD_SIZE));
}
NativeIO.madviseRange(mappedByteBuffers.get(endIdx), advice, 0, (int)((endIdx % bufferSize) * WORD_SIZE));
}
else {
var buff = mappedByteBuffers.get(startIdx);
NativeIO.madviseRange(buff, advice, (startLongs % bufferSize) * WORD_SIZE, (int) (lengthLongs * WORD_SIZE));
}
NativeIO.madviseRange(buff, advice, (startLongs % bufferSize) * WORD_SIZE, (int)(lengthLongs*WORD_SIZE));
}
public void pokeRange(long offset, long length) {

View File

@ -17,19 +17,7 @@ import java.nio.file.Files;
import java.util.Objects;
public class SearchIndexPreconverter {
private static final int CHUNK_HEADER_SIZE = 16;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final SearchIndexPartitioner partitioner;
private final TIntHashSet spamDomains;
/**
 * Returns the value this class treats as the word count of {@code inputFile}.
 *
 * <p>The file is opened read-only, the leading 8-byte long is consumed and
 * discarded, and the 4-byte int that follows is returned widened to {@code long}.
 * Checked exceptions raised while reading propagate undeclared via
 * {@code @SneakyThrows}.
 *
 * @param inputFile file whose first twelve bytes are read
 * @return the int stored immediately after the first long in the file
 */
@SneakyThrows
public static long wordCount(File inputFile) {
    try (RandomAccessFile header = new RandomAccessFile(inputFile, "r")) {
        // The first long is not needed here; reading it advances to the count.
        header.readLong();

        final int count = header.readInt();
        return count;
    }
}
@SneakyThrows
@Inject
@ -38,8 +26,7 @@ public class SearchIndexPreconverter {
SearchIndexPartitioner partitioner,
EdgeDomainBlacklist blacklist)
{
this.partitioner = partitioner;
this.spamDomains = blacklist.getSpamDomains();
TIntHashSet spamDomains = blacklist.getSpamDomains();
logger.info("Preconverting {}", inputFile);
for (File f : outputFiles) {
@ -64,12 +51,10 @@ public class SearchIndexPreconverter {
fileChannels[i] = randomAccessFiles[i].getChannel();
}
var lock = partitioner.getReadLock();
try {
lock.lock();
ByteBuffer buffer = ByteBuffer.allocateDirect(8192);
for (var entry : indexJournalReader) {
if (!partitioner.isGoodUrl(entry.urlId())
|| spamDomains.contains(entry.domainId())) {
@ -93,6 +78,7 @@ public class SearchIndexPreconverter {
finally {
lock.unlock();
}
logger.info("Finalizing preconversion");
for (int i = 0; i < randomAccessFiles.length; i++) {
long pos = randomAccessFiles[i].getFilePointer();

View File

@ -11,6 +11,7 @@ public enum IndexBlock {
Meta(7, 7),
PositionWords(8, 4.5),
NamesWords(9, 5),
Unused(10, 10),
Topic(11, 0.5);
public final int id;
@ -19,7 +20,6 @@ public enum IndexBlock {
/**
 * Creates an index block constant from its two defining values.
 *
 * @param id        stored in the {@code id} field
 * @param sortOrder stored in the {@code sortOrder} field
 */
IndexBlock(int id, double sortOrder) {
    this.id = id;
    this.sortOrder = sortOrder;
}
public static IndexBlock byId(int id) {

View File

@ -46,17 +46,14 @@ public class SearchIndex implements AutoCloseable {
/**
 * Issues memory-access advice on {@code urls}, one batch of calls per offset
 * supplied by {@code words.forEachWordsOffset}.
 *
 * NOTE(review): this listing appears to come from a diff view with its +/- markers
 * stripped, so some of the adviceRange/pokeRange calls below may be old/new
 * alternatives rather than one intended sequence — confirm against the checked-in file.
 *
 * @param urls   mapped file that receives the advice, adviceRange and pokeRange calls
 * @param reader used to fetch the header found at each offset
 */
private void madvise(MultimapFileLong urls, BTreeReader reader) {
// Advice given without a range, issued once before the per-offset calls.
urls.advice(NativeIO.Advice.Random);
words.forEachWordsOffset(offset -> {
var h = reader.getHeader(offset);
// Distance from the header's index offset to its data offset.
long length = h.dataOffsetLongs() - h.indexOffsetLongs();
// Two different advice values for the same span (start = offset, length = 512);
// presumably the later call takes precedence — TODO confirm.
urls.adviceRange(NativeIO.Advice.Normal, offset, 512);
urls.adviceRange(NativeIO.Advice.WillNeed, offset, 512);
// The remaining calls only run when the index span is non-empty.
if (length > 0) {
urls.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length);
// Length is capped at 2048. NOTE(review): the entry size is read through
// bTreeReader, not the reader parameter — confirm that is intended.
urls.adviceRange(NativeIO.Advice.Normal, h.dataOffsetLongs(), Math.min(2048, h.numEntries()*bTreeReader.ctx.entrySize()));
urls.pokeRange(h.indexOffsetLongs(), length);
}
});
}