mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00

Anchor text capture work-in-progress

parent 1068694db6 · commit 35878c5102
New file: nu.marginalia.util.DenseBitMap (@@ -0,0 +1,37 @@)

```java
package nu.marginalia.util;

import java.nio.ByteBuffer;

public class DenseBitMap {
    public static final long MAX_CAPACITY_2GB_16BN_ITEMS = (1L << 34) - 8;

    public final long cardinality;
    private final ByteBuffer buffer;

    public DenseBitMap(long cardinality) {
        this.cardinality = cardinality;

        boolean misaligned = (cardinality & 7) > 0;
        this.buffer = ByteBuffer.allocateDirect((int) ((cardinality / 8) + (misaligned ? 1 : 0)));
    }

    public boolean get(long pos) {
        return (buffer.get((int) (pos >>> 3)) & ((byte) 1 << (int) (pos & 7))) != 0;
    }

    /** Set the bit indexed by pos, returns
     *  its previous value.
     */
    public boolean set(long pos) {
        int offset = (int) (pos >>> 3);
        int oldVal = buffer.get(offset);
        int mask = (byte) 1 << (int) (pos & 7);
        buffer.put(offset, (byte) (oldVal | mask));
        return (oldVal & mask) != 0;
    }

    public void clear(long pos) {
        int offset = (int) (pos >>> 3);
        buffer.put(offset, (byte) (buffer.get(offset) & ~(byte) (1 << (int) (pos & 7))));
    }
}
```
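A quick usage sketch (an editor's addition, not part of the commit): note that MAX_CAPACITY_2GB_16BN_ITEMS = (1L << 34) − 8 bits is exactly Integer.MAX_VALUE bytes, the largest backing array a single ByteBuffer.allocateDirect call can hold, i.e. roughly 16 billion bits in just under 2 GB.

```java
// Hypothetical caller of DenseBitMap as reconstructed above.
var seen = new DenseBitMap(1_000_000);

boolean first  = seen.set(123_456); // false: the bit was previously clear
boolean second = seen.set(123_456); // true: already set, i.e. a duplicate
seen.clear(123_456);                // back to clear
assert !seen.get(123_456);
```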
RandomWriteFunnel:

```diff
@@ -125,7 +125,7 @@ public class RandomWriteFunnel implements AutoCloseable {
                 dest.putLong(addr, data);
             }
             catch (IndexOutOfBoundsException ex) {
-                logger.info("!!!bad[{}]={}", addr, data);
+                logger.info("Bad poke[{}]={}, this happens if an RWF is allocated with insufficient size", addr, data);
             }
         }
         buffer.compact();
```
BTreeReader:

```diff
@@ -5,8 +5,6 @@ import nu.marginalia.util.btree.model.BTreeHeader;
 import nu.marginalia.util.multimap.MultimapFileLong;
 import nu.marginalia.util.multimap.MultimapSearcher;
 
-import javax.annotation.CheckReturnValue;
-
 import static java.lang.Math.min;
 
 public class BTreeReader {
@@ -68,7 +66,7 @@ public class BTreeReader {
         for (int i = header.layers() - 1; i >= 0; --i) {
             final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
 
-            final long nextLayerOffset = indexSearch(key, indexAddress + indexLayerBlockOffset, blockSize);
+            final long nextLayerOffset = relativePositionInIndex(key, indexAddress + indexLayerBlockOffset, blockSize);
             if (nextLayerOffset < 0)
                 return nextLayerOffset;
 
@@ -78,7 +76,7 @@ public class BTreeReader {
         return layerOffset;
     }
 
-    private long indexSearch(long key, long start, long n) {
+    private long relativePositionInIndex(long key, long start, long n) {
         return indexSearcher.binarySearchUpper(key, start, n) - start;
     }
```
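For context on what the rename clarifies: each index layer is scanned with an upper-bound binary search, and the result is made relative to the start of the block so it can be scaled into the next layer down. A minimal sketch of that contract, assuming a plain long[] stands in for MultimapSearcher and that a negative result signals the key falls outside the block; the names are illustrative, not the actual Marginalia API.

```java
// Upper-bound binary search over index[start, start+n), relative to the block.
static long relativePositionInIndex(long[] index, long key, int start, int n) {
    int low = start, high = start + n;
    while (low < high) {
        int mid = (low + high) >>> 1;
        if (index[mid] < key) low = mid + 1;
        else high = mid;
    }
    if (low == start + n) return -1; // key is larger than every entry in the block
    return low - start;              // offset relative to the block start
}
```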
DocumentSentence (nu.marginalia.util.language.processing.model):

```diff
@@ -2,15 +2,17 @@ package nu.marginalia.util.language.processing.model;
 
 import nu.marginalia.util.language.WordPatterns;
+import org.jetbrains.annotations.NotNull;
 
 import java.lang.ref.SoftReference;
 import java.util.BitSet;
+import java.util.Iterator;
 import java.util.StringJoiner;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
-public class DocumentSentence {
+public class DocumentSentence implements Iterable<DocumentSentence.SentencePos> {
     public final String originalSentence;
     public final String[] words;
     public final int[] separators;
@@ -85,4 +87,37 @@ public class DocumentSentence {
     public String toString() {
         return IntStream.range(0, length()).mapToObj(i -> String.format("%s[%s]", words[i], posTags[i])).collect(Collectors.joining(" "));
     }
+
+    @NotNull
+    @Override
+    public Iterator<SentencePos> iterator() {
+        return new Iterator<>() {
+            int i = -1;
+
+            @Override
+            public boolean hasNext() {
+                return i + 1 < length();
+            }
+
+            @Override
+            public SentencePos next() {
+                return new SentencePos(++i);
+            }
+        };
+    }
+
+    public class SentencePos {
+        public final int pos;
+
+        public SentencePos(int pos) {
+            this.pos = pos;
+        }
+
+        public String word() { return words[pos]; }
+        public String wordLowerCase() { return wordsLowerCase[pos]; }
+        public String posTag() { return posTags[pos]; }
+        public String stemmed() { return stemmedWords[pos]; }
+        public int separator() { return separators[pos]; }
+        public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }
+    }
 }
```
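The new Iterable implementation lets callers (such as LinkKeywordExtractorMain below) walk a sentence position by position instead of indexing into the parallel words/posTags/stemmedWords arrays. A hypothetical usage sketch, assuming a DocumentSentence produced by SentenceExtractor:

```java
// 'sentence' is assumed to come from SentenceExtractor.extractSentences(...)
for (var wordPos : sentence) {
    if (wordPos.isStopWord())
        continue;
    System.out.println(wordPos.wordLowerCase() + " [" + wordPos.posTag() + "]");
}
```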
ConverterMain:

```diff
@@ -52,13 +52,6 @@ public class ConverterMain {
         injector.getInstance(ConverterMain.class);
     }
 
-    private static void requireArgs(String[] args, String... help) {
-        if (args.length != help.length) {
-            System.out.println("Usage: " + String.join(", ", help));
-            System.exit(255);
-        }
-    }
-
     @Inject
     public ConverterMain(
             EdgeCrawlPlan plan,
@@ -103,7 +96,8 @@ public class ConverterMain {
 
         domainToId.forEach((domain, id) -> {
             String fileName = idToFileName.get(id);
-            Path dest = getFilePath(plan.crawl.getDir(), fileName);
+            Path dest = plan.getCrawledFilePath(fileName);
+
             logger.info("{} - {} - {}", domain, id, dest);
 
             if (!processLog.isJobFinished(id)) {
@@ -128,10 +122,4 @@ public class ConverterMain {
 
     record ProcessingInstructions(String id, List<Instruction> instructions) {}
 
-    private Path getFilePath(Path dir, String fileName) {
-        String sp1 = fileName.substring(0, 2);
-        String sp2 = fileName.substring(2, 4);
-        return dir.resolve(sp1).resolve(sp2).resolve(fileName);
-    }
-
 }
```
@ -0,0 +1,194 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.converting;
|
||||||
|
|
||||||
|
import com.google.common.hash.HashFunction;
|
||||||
|
import com.google.common.hash.Hashing;
|
||||||
|
import com.google.inject.Guice;
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Injector;
|
||||||
|
import nu.marginalia.util.DenseBitMap;
|
||||||
|
import nu.marginalia.util.language.WordPatterns;
|
||||||
|
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||||
|
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||||
|
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
|
||||||
|
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
|
||||||
|
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
|
||||||
|
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
|
||||||
|
import nu.marginalia.wmsa.edge.crawling.WorkLog;
|
||||||
|
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.function.Predicate;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
public class LinkKeywordExtractorMain {
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class);
|
||||||
|
|
||||||
|
public static void main(String... args) throws IOException {
|
||||||
|
|
||||||
|
if (args.length != 1) {
|
||||||
|
System.err.println("Arguments: crawl-plan.yaml");
|
||||||
|
System.exit(0);
|
||||||
|
}
|
||||||
|
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
||||||
|
|
||||||
|
Injector injector = Guice.createInjector(
|
||||||
|
new ConverterModule(plan)
|
||||||
|
);
|
||||||
|
|
||||||
|
injector.getInstance(LinkKeywordExtractorMain.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
private final HashSet<String> crawledDomains = new HashSet<>();
|
||||||
|
private final List<String> fileNames = new ArrayList<>();
|
||||||
|
private final LinkParser linkParser = new LinkParser();
|
||||||
|
private final SentenceExtractor sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||||
|
|
||||||
|
private final HashFunction hashFunction = Hashing.murmur3_128();
|
||||||
|
|
||||||
|
// This bit map is used as a bloom filter to deduplicate url-keyword combinations
|
||||||
|
// false positives are expected, but that's an acceptable trade-off to not have to deal with
|
||||||
|
// de-duplicating billions of shuffled (url, word) tuples on limited hardware
|
||||||
|
private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public LinkKeywordExtractorMain(EdgeCrawlPlan plan) throws IOException {
|
||||||
|
logger.info("Loading input spec");
|
||||||
|
|
||||||
|
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
|
||||||
|
spec -> crawledDomains.add(spec.domain));
|
||||||
|
|
||||||
|
logger.info("Replaying crawl log");
|
||||||
|
WorkLog.readLog(plan.crawl.getLogFile(),
|
||||||
|
entry -> fileNames.add(entry.path()));
|
||||||
|
|
||||||
|
logger.info("Reading files");
|
||||||
|
for (var fn : fileNames) {
|
||||||
|
CrawledDomainReader crawledDomainReader = new CrawledDomainReader();
|
||||||
|
var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn));
|
||||||
|
if (crawledDomain.doc == null) continue;
|
||||||
|
|
||||||
|
System.out.println("# " + crawledDomain.domain);
|
||||||
|
|
||||||
|
for (var doc : crawledDomain.doc) {
|
||||||
|
try {
|
||||||
|
if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
|
||||||
|
processDocument(doc.url, doc.documentBody);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (URISyntaxException ex) {
|
||||||
|
// This Shouldn't Happen (TM) as the URL that we're failing to process
|
||||||
|
// is expected to have already been parsed by this code successfully
|
||||||
|
// in the process of getting here.
|
||||||
|
//
|
||||||
|
// But also, if it does happen, it's no big deal
|
||||||
|
|
||||||
|
logger.warn("Bad URL format", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+");
|
||||||
|
|
||||||
|
private void processDocument(String docUrl, String documentBody) throws URISyntaxException {
|
||||||
|
var processed = Jsoup.parse(documentBody);
|
||||||
|
|
||||||
|
EdgeUrl documentUrl = new EdgeUrl(docUrl);
|
||||||
|
|
||||||
|
for (var link : processed.getElementsByTag("a")) {
|
||||||
|
if (link.hasAttr("href")) {
|
||||||
|
String href = link.attr("href");
|
||||||
|
String text = anchorTextNoise.matcher(link.text().toLowerCase()).replaceAll(" ").trim();
|
||||||
|
|
||||||
|
processAnchor(documentUrl, href, text);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processAnchor(EdgeUrl documentUrl, String href, String text) {
|
||||||
|
if (!isInterestingAnchorText(text)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
var optLinkUrl = linkParser.parseLink(documentUrl, href);
|
||||||
|
if (optLinkUrl.isEmpty()) return;
|
||||||
|
|
||||||
|
var linkUrl = optLinkUrl.get();
|
||||||
|
|
||||||
|
if (!isInterestingAnchorLink(linkUrl)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
DocumentLanguageData languageData = sentenceExtractor.extractSentences(text);
|
||||||
|
for (var sent : languageData.sentences) {
|
||||||
|
for (var wordPos : sent) {
|
||||||
|
if (wordPos.isStopWord())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
String word = wordPos.wordLowerCase();
|
||||||
|
if (!WordPatterns.wordQualitiesPredicate.test(word) || !WordPatterns.filter(word))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
|
||||||
|
if (!linkUrl.domain.equals(documentUrl.domain)) {
|
||||||
|
if (isNewKeywordForLink(word, linkUrl.toString())) {
|
||||||
|
System.out.println(linkUrl + "\t" + word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
|
||||||
|
private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();
|
||||||
|
|
||||||
|
private boolean isInterestingAnchorText(String text) {
|
||||||
|
if (text.isBlank()) return false;
|
||||||
|
if (text.length() > 32) return false;
|
||||||
|
|
||||||
|
// Google loves questions, and so does SEO spammers
|
||||||
|
if (text.endsWith("?")) return false;
|
||||||
|
|
||||||
|
if (text.startsWith("http:") || text.startsWith("https:")) return false;
|
||||||
|
|
||||||
|
if (looksLikeAnURL.test(text)) return false;
|
||||||
|
|
||||||
|
return switch (text) {
|
||||||
|
case "this", "here", "click", "click here", "download", "source" -> false;
|
||||||
|
default -> true;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
|
||||||
|
if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return crawledDomains.contains(linkUrl.domain.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isNewKeywordForLink(String href, String text) {
|
||||||
|
long hash = 0;
|
||||||
|
|
||||||
|
hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
|
||||||
|
hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();
|
||||||
|
|
||||||
|
// Remove sign bit because we don't want a negative index in deduplicateHashBitset
|
||||||
|
hash &= 0x7FFF_FFFF_FFFF_FFFFL;
|
||||||
|
|
||||||
|
return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
|
||||||
|
}
|
||||||
|
}
|
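The deduplication arithmetic deserves spelling out: the two 128-bit murmur3 hashes are truncated to 64 bits by asLong(), XOR-combined into a single fingerprint per (url, word) pair, stripped of the sign bit so the modulo stays non-negative, and reduced to a bit index. This makes the DenseBitMap a one-hash-function Bloom filter: a false positive silently drops a keyword, the trade-off the comment in the file accepts. A sketch of that arithmetic with made-up inputs, using Guava's Hashing as imported above:

```java
HashFunction h = Hashing.murmur3_128();

// One 64-bit fingerprint per (url, word) pair; XOR order is immaterial.
long hash = h.hashString("https://example.com/page", StandardCharsets.UTF_8).asLong()
          ^ h.hashString("keyword", StandardCharsets.UTF_8).asLong();

hash &= 0x7FFF_FFFF_FFFF_FFFFL; // clear the sign bit

long bitIndex = hash % DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS; // slot in the ~2 GB filter
```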
LinkParser:

```diff
@@ -145,13 +145,8 @@ public class LinkParser {
     }
 
     private boolean isRelRelevant(String rel) {
-        if (null == rel) {
-            return true;
-        }
-        return switch (rel) {
-            case "noindex" -> false;
-            default -> true;
-        };
+        // this is null safe
+        return !"noindex".equalsIgnoreCase(rel);
     }
 
     private boolean isUrlRelevant(String href) {
```
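The one-liner preserves the old null handling because equalsIgnoreCase is invoked on the "noindex" constant: a null rel yields false, so the method still returns true, exactly as the deleted null check did. It also broadens the match from the case-sensitive switch to any casing of "noindex". A quick check of the semantics:

```java
// Both facts follow from String.equalsIgnoreCase tolerating a null argument.
assert !"noindex".equalsIgnoreCase(null);      // null rel   -> isRelRelevant returns true
assert  "noindex".equalsIgnoreCase("NoIndex"); // any casing -> link is now filtered out
```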
SearchIndexConverter (logging cleanup):

```diff
@@ -91,18 +91,13 @@ public class SearchIndexConverter {
         }
     }
 
-
-
     private WordIndexOffsetsTable createWordIndexTable(SearchIndexJournalReader journalReader,
                                                        File outputFileWords) throws IOException
     {
         final int topWord = (int) journalReader.fileHeader.wordCount();
 
-        logger.debug("Table size = {}", topWord);
         WordsTableWriter wordsTableWriter = new WordsTableWriter(topWord);
 
-        logger.debug("Reading words");
-
         for (var entry : journalReader) {
             if (!isRelevantEntry(entry)) {
                 continue;
@@ -119,8 +114,6 @@ public class SearchIndexConverter {
         }
     }
 
-        logger.debug("Rearranging table");
-
         wordsTableWriter.write(outputFileWords);
 
         return wordsTableWriter.getTable();
@@ -130,15 +123,12 @@ public class SearchIndexConverter {
                                      Path tmpUrlsFile,
                                      WordIndexOffsetsTable wordOffsetsTable) throws IOException
     {
-        logger.info("Table size = {}", wordOffsetsTable.length());
-
-
         long numberOfWordsTotal = 0;
         for (var entry : journalReader) {
             if (isRelevantEntry(entry))
                 numberOfWordsTotal += entry.wordCount();
         }
 
 
         try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
              FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) {
@@ -168,7 +158,6 @@ public class SearchIndexConverter {
             }
         }
 
-
         rwf.write(urlsTmpFileChannel);
     }
 
@@ -176,8 +165,6 @@ public class SearchIndexConverter {
 
         try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) {
             if (wordOffsetsTable.length() > 0) {
-                logger.info("Sorting urls table");
-
                 var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit);
 
                 wordOffsetsTable.forEachRange(urlTmpFileSorter::sort);
@@ -188,7 +175,6 @@ public class SearchIndexConverter {
             }
         }
 
-        logger.info("Writing BTree");
         try (var urlsFileMap = MultimapFileLong.forOutput(outputFileUrls.toPath(), numberOfWordsTotal)) {
             var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);
 
@@ -206,7 +192,6 @@ public class SearchIndexConverter {
         }
     }
 
-
     private long translateUrl(long url) {
         int domainId = partitioner.translateId(bucketId, (int) (url >>> 32));
         return ((long)domainId << 32) | (url & 0xFFFFFFFFL);
```
WordsTableWriter:

```diff
@@ -43,11 +43,11 @@ public class WordsTableWriter {
 
             var writer = new BTreeWriter(mmf, wordsBTreeContext);
 
-            writer.write(offset, tableSize, this::writeBTreeBlock);
+            writer.write(offset, tableSize, this::writeBTreeDataBlock);
         }
     }
 
-    private void writeBTreeBlock(MultimapFileLongSlice mapSlice) {
+    private void writeBTreeDataBlock(MultimapFileLongSlice mapSlice) {
         long urlFileOffset = 0;
         int idx = 0;
 
```
EdgeCrawlPlan:

```diff
@@ -27,4 +27,15 @@ public class EdgeCrawlPlan {
         }
     }
 
+    public Path getCrawledFilePath(String fileName) {
+        String sp1 = fileName.substring(0, 2);
+        String sp2 = fileName.substring(2, 4);
+        return crawl.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
+    }
+
+    public Path getProcessedFilePath(String fileName) {
+        String sp1 = fileName.substring(0, 2);
+        String sp2 = fileName.substring(2, 4);
+        return process.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
+    }
 }
```
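Both helpers implement the same two-level fan-out that ConverterMain previously hand-rolled: the first four characters of the file name (a hash-like identifier in practice) select two nested directories, so no single directory accumulates an unbounded number of crawl files. Schematically, with an invented file name:

```java
// shard(Path.of("crawl-data"), "abcd1234.zstd") -> crawl-data/ab/cd/abcd1234.zstd
// The name and extension are made up; only the substring logic mirrors the commit.
static java.nio.file.Path shard(java.nio.file.Path dir, String fileName) {
    String sp1 = fileName.substring(0, 2); // first shard level
    String sp2 = fileName.substring(2, 4); // second shard level
    return dir.resolve(sp1).resolve(sp2).resolve(fileName);
}
```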
@ -0,0 +1,56 @@
|
|||||||
|
package nu.marginalia.util;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class DenseBitMapTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSetAll() {
|
||||||
|
var dbm = new DenseBitMap(129);
|
||||||
|
for (int i = 0; i < dbm.cardinality; i++) {
|
||||||
|
dbm.set(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < dbm.cardinality; i++) {
|
||||||
|
assertTrue(dbm.get(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSetEven() {
|
||||||
|
var dbm = new DenseBitMap(131);
|
||||||
|
for (int i = 0; i < dbm.cardinality; i+=2) {
|
||||||
|
dbm.set(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < dbm.cardinality; i+=2) {
|
||||||
|
assertTrue(dbm.get(i));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 1; i < dbm.cardinality; i+=2) {
|
||||||
|
assertFalse(dbm.get(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSetAllClearSome() {
|
||||||
|
var dbm = new DenseBitMap(129);
|
||||||
|
|
||||||
|
for (int i = 0; i < dbm.cardinality; i++) {
|
||||||
|
dbm.set(i);
|
||||||
|
}
|
||||||
|
for (int i = 1; i < dbm.cardinality; i+=2) {
|
||||||
|
dbm.clear(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < dbm.cardinality; i+=2) {
|
||||||
|
assertTrue(dbm.get(i), "Expected " + i + " to be set");
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 1; i < dbm.cardinality; i+=2) {
|
||||||
|
assertFalse(dbm.get(i), "Expected " + i + " to be clear");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|