Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)

Commit 35878c5102 (parent 1068694db6)

Anchor text capture work-in-progress
@@ -0,0 +1,37 @@
package nu.marginalia.util;

import java.nio.ByteBuffer;

public class DenseBitMap {
    public static final long MAX_CAPACITY_2GB_16BN_ITEMS = (1L << 34) - 8;

    public final long cardinality;
    private final ByteBuffer buffer;

    public DenseBitMap(long cardinality) {
        this.cardinality = cardinality;

        boolean misaligned = (cardinality & 7) > 0;
        this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0)));
    }

    public boolean get(long pos) {
        return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0;
    }

    /** Set the bit indexed by pos, returns
     *  its previous value.
     */
    public boolean set(long pos) {
        int offset = (int) (pos >>> 3);
        int oldVal = buffer.get(offset);
        int mask = (byte) 1 << (int) (pos & 7);
        buffer.put(offset, (byte) (oldVal | mask));
        return (oldVal & mask) != 0;
    }

    public void clear(long pos) {
        int offset = (int)(pos >>> 3);
        buffer.put(offset, (byte)(buffer.get(offset) & ~(byte)(1 << (int)(pos & 7))));
    }
}
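A note on the arithmetic above (editorial aside, not part of the commit): MAX_CAPACITY_2GB_16BN_ITEMS is 2^34 - 8 bits, roughly the "16 billion items" of the name, and 2^34 bits / 8 = 2^31 bytes, so the whole map fits a 2 GiB direct ByteBuffer. In get/set, pos >>> 3 selects the backing byte and pos & 7 the bit within it. A minimal standalone sketch of that indexing:

public class BitIndexingDemo {
    public static void main(String[] args) {
        long pos = 43;                       // example bit position

        int byteOffset = (int) (pos >>> 3);  // 43 / 8 = 5 -> the bit lives in byte 5
        int bitInByte  = (int) (pos & 7);    // 43 % 8 = 3 -> bit number 3 of that byte
        int mask       = 1 << bitInByte;     // 0b0000_1000

        System.out.printf("pos=%d -> byte %d, mask 0x%02X%n", pos, byteOffset, mask);
    }
}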
@@ -125,7 +125,7 @@ public class RandomWriteFunnel implements AutoCloseable {
                    dest.putLong(addr, data);
                }
                catch (IndexOutOfBoundsException ex) {
                    logger.info("!!!bad[{}]={}", addr, data);
                    logger.info("Bad poke[{}]={}, this happens if an RWF is allocated with insufficient size", addr, data);
                }
            }
            buffer.compact();
@@ -5,8 +5,6 @@ import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapSearcher;

import javax.annotation.CheckReturnValue;

import static java.lang.Math.min;

public class BTreeReader {
@@ -68,7 +66,7 @@ public class BTreeReader {
        for (int i = header.layers() - 1; i >= 0; --i) {
            final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;

            final long nextLayerOffset = indexSearch(key, indexAddress + indexLayerBlockOffset, blockSize);
            final long nextLayerOffset = relativePositionInIndex(key, indexAddress + indexLayerBlockOffset, blockSize);
            if (nextLayerOffset < 0)
                return nextLayerOffset;

@@ -78,7 +76,7 @@ public class BTreeReader {
        return layerOffset;
    }

    private long indexSearch(long key, long start, long n) {
    private long relativePositionInIndex(long key, long start, long n) {
        return indexSearcher.binarySearchUpper(key, start, n) - start;
    }
@@ -2,15 +2,17 @@ package nu.marginalia.util.language.processing.model;


import nu.marginalia.util.language.WordPatterns;
import org.jetbrains.annotations.NotNull;

import java.lang.ref.SoftReference;
import java.util.BitSet;
import java.util.Iterator;
import java.util.StringJoiner;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public class DocumentSentence {
public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
    public final String originalSentence;
    public final String[] words;
    public final int[] separators;
@@ -85,4 +87,37 @@ public class DocumentSentence {
    public String toString() {
        return IntStream.range(0, length()).mapToObj(i -> String.format("%s[%s]", words[i], posTags[i])).collect(Collectors.joining(" "));
    }

    @NotNull
    @Override
    public Iterator<SentencePos> iterator() {
        return new Iterator<>() {
            int i = -1;
            @Override
            public boolean hasNext() {
                return i+1 < length();
            }

            @Override
            public SentencePos next() {
                return new SentencePos(++i);
            }
        };
    }

    public class SentencePos {
        public final int pos;

        public SentencePos(int pos) {
            this.pos = pos;
        }

        public String word() { return words[pos]; }
        public String wordLowerCase() { return wordsLowerCase[pos]; }
        public String posTag() { return posTags[pos]; }
        public String stemmed() { return stemmedWords[pos]; }
        public int separator() { return separators[pos]; }
        public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }
    }
}
@@ -52,13 +52,6 @@ public class ConverterMain {
        injector.getInstance(ConverterMain.class);
    }

    private static void requireArgs(String[] args, String... help) {
        if (args.length != help.length) {
            System.out.println("Usage: " + String.join(", ", help));
            System.exit(255);
        }
    }

    @Inject
    public ConverterMain(
            EdgeCrawlPlan plan,
@@ -103,7 +96,8 @@ public class ConverterMain {

        domainToId.forEach((domain, id) -> {
            String fileName = idToFileName.get(id);
            Path dest = getFilePath(plan.crawl.getDir(), fileName);
            Path dest = plan.getCrawledFilePath(fileName);

            logger.info("{} - {} - {}", domain, id, dest);

            if (!processLog.isJobFinished(id)) {
@@ -128,10 +122,4 @@ public class ConverterMain {

    record ProcessingInstructions(String id, List<Instruction> instructions) {}

    private Path getFilePath(Path dir, String fileName) {
        String sp1 = fileName.substring(0, 2);
        String sp2 = fileName.substring(2, 4);
        return dir.resolve(sp1).resolve(sp2).resolve(fileName);
    }

}
@@ -0,0 +1,194 @@
package nu.marginalia.wmsa.edge.converting;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.function.Predicate;
import java.util.regex.Pattern;

public class LinkKeywordExtractorMain {
    private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class);

    public static void main(String... args) throws IOException {

        if (args.length != 1) {
            System.err.println("Arguments: crawl-plan.yaml");
            System.exit(0);
        }
        var plan = new CrawlPlanLoader().load(Path.of(args[0]));

        Injector injector = Guice.createInjector(
                new ConverterModule(plan)
        );

        injector.getInstance(LinkKeywordExtractorMain.class);
    }

    private final HashSet<String> crawledDomains = new HashSet<>();
    private final List<String> fileNames = new ArrayList<>();
    private final LinkParser linkParser = new LinkParser();
    private final SentenceExtractor sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());

    private final HashFunction hashFunction = Hashing.murmur3_128();

    // This bit map is used as a bloom filter to deduplicate url-keyword combinations
    // false positives are expected, but that's an acceptable trade-off to not have to deal with
    // de-duplicating billions of shuffled (url, word) tuples on limited hardware
    private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);

    @Inject
    public LinkKeywordExtractorMain(EdgeCrawlPlan plan) throws IOException {
        logger.info("Loading input spec");

        CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
                spec -> crawledDomains.add(spec.domain));

        logger.info("Replaying crawl log");
        WorkLog.readLog(plan.crawl.getLogFile(),
                entry -> fileNames.add(entry.path()));

        logger.info("Reading files");
        for (var fn : fileNames) {
            CrawledDomainReader crawledDomainReader = new CrawledDomainReader();
            var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn));
            if (crawledDomain.doc == null) continue;

            System.out.println("# " + crawledDomain.domain);

            for (var doc : crawledDomain.doc) {
                try {
                    if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
                        processDocument(doc.url, doc.documentBody);
                    }
                }
                catch (URISyntaxException ex) {
                    // This Shouldn't Happen (TM) as the URL that we're failing to process
                    // is expected to have already been parsed by this code successfully
                    // in the process of getting here.
                    //
                    // But also, if it does happen, it's no big deal

                    logger.warn("Bad URL format", ex);
                }
            }
        }
    }

    private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+");

    private void processDocument(String docUrl, String documentBody) throws URISyntaxException {
        var processed = Jsoup.parse(documentBody);

        EdgeUrl documentUrl = new EdgeUrl(docUrl);

        for (var link : processed.getElementsByTag("a")) {
            if (link.hasAttr("href")) {
                String href = link.attr("href");
                String text = anchorTextNoise.matcher(link.text().toLowerCase()).replaceAll(" ").trim();

                processAnchor(documentUrl, href, text);
            }
        }
    }

    private void processAnchor(EdgeUrl documentUrl, String href, String text) {
        if (!isInterestingAnchorText(text)) {
            return;
        }

        var optLinkUrl = linkParser.parseLink(documentUrl, href);
        if (optLinkUrl.isEmpty()) return;

        var linkUrl = optLinkUrl.get();

        if (!isInterestingAnchorLink(linkUrl)) {
            return;
        }

        DocumentLanguageData languageData = sentenceExtractor.extractSentences(text);
        for (var sent : languageData.sentences) {
            for (var wordPos : sent) {
                if (wordPos.isStopWord())
                    continue;

                String word = wordPos.wordLowerCase();
                if (!WordPatterns.wordQualitiesPredicate.test(word) || !WordPatterns.filter(word))
                    continue;


                if (!linkUrl.domain.equals(documentUrl.domain)) {
                    if (isNewKeywordForLink(word, linkUrl.toString())) {
                        System.out.println(linkUrl + "\t" + word);
                    }
                }
            }
        }
    }

    // This pattern doesn't need to perfectly capture all anchor texts that are URLs; if it gets 95%, that's fine
    private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();

    private boolean isInterestingAnchorText(String text) {
        if (text.isBlank()) return false;
        if (text.length() > 32) return false;

        // Google loves questions, and so do SEO spammers
        if (text.endsWith("?")) return false;

        if (text.startsWith("http:") || text.startsWith("https:")) return false;

        if (looksLikeAnURL.test(text)) return false;

        return switch (text) {
            case "this", "here", "click", "click here", "download", "source" -> false;
            default -> true;
        };
    }

    private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
        if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
            return false;
        }

        return crawledDomains.contains(linkUrl.domain.toString());
    }

    private boolean isNewKeywordForLink(String href, String text) {
        long hash = 0;

        hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
        hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();

        // Remove sign bit because we don't want a negative index in deduplicateHashBitset
        hash &= 0x7FFF_FFFF_FFFF_FFFFL;

        return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
    }
}
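The isNewKeywordForLink method above is the whole deduplication scheme: XOR two 64-bit murmur3 hashes, clear the sign bit, and reduce modulo the bitmap's cardinality, so a repeated (url, keyword) pair lands on a bit that is already set. A rough standalone sketch of the same idea (editorial, not from the commit; names are illustrative and a small in-heap BitSet stands in for DenseBitMap):

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

import java.nio.charset.StandardCharsets;
import java.util.BitSet;

public class UrlKeywordDedupSketch {
    private static final HashFunction hashFunction = Hashing.murmur3_128();
    private static final long cardinality = 1 << 20;                 // tiny compared to the real 2 GiB map
    private static final BitSet seen = new BitSet((int) cardinality);

    // Returns true the first time this (url, keyword) pair is seen.
    // A hash collision can make a brand-new pair look "already seen" (a false positive),
    // which simply drops that keyword -- the trade-off the commit's comment accepts.
    static boolean isNewKeywordForLink(String url, String keyword) {
        long hash = 0;
        hash ^= hashFunction.hashString(url, StandardCharsets.UTF_8).asLong();
        hash ^= hashFunction.hashString(keyword, StandardCharsets.UTF_8).asLong();

        hash &= 0x7FFF_FFFF_FFFF_FFFFL;          // clear the sign bit -> non-negative index
        int idx = (int) (hash % cardinality);    // fold into the bitmap's range

        boolean alreadySet = seen.get(idx);
        seen.set(idx);
        return !alreadySet;
    }

    public static void main(String[] args) {
        System.out.println(isNewKeywordForLink("https://example.com/", "marginalia")); // true
        System.out.println(isNewKeywordForLink("https://example.com/", "marginalia")); // false (duplicate)
    }
}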
@@ -145,13 +145,8 @@ public class LinkParser {
    }

    private boolean isRelRelevant(String rel) {
        if (null == rel) {
            return true;
        }
        return switch (rel) {
            case "noindex" -> false;
            default -> true;
        };
        // this is null safe
        return !"noindex".equalsIgnoreCase(rel);
    }

    private boolean isUrlRelevant(String href) {
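For reference, a tiny standalone illustration (not from the commit) of why the one-liner that replaces the switch is null safe: calling equalsIgnoreCase on the string constant means a null rel argument simply compares unequal instead of throwing.

public class NullSafeRelCheck {
    public static void main(String[] args) {
        System.out.println(isRelRelevant(null));        // true  - no rel attribute at all
        System.out.println(isRelRelevant("nofollow"));  // true
        System.out.println(isRelRelevant("NOINDEX"));   // false - case-insensitive match
    }

    // Same shape as the new LinkParser logic: constant on the left, argument may be null.
    static boolean isRelRelevant(String rel) {
        return !"noindex".equalsIgnoreCase(rel);
    }
}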
@@ -91,18 +91,13 @@ public class SearchIndexConverter {
        }
    }



    private WordIndexOffsetsTable createWordIndexTable(SearchIndexJournalReader journalReader,
                                                       File outputFileWords) throws IOException
    {
        final int topWord = (int) journalReader.fileHeader.wordCount();

        logger.debug("Table size = {}", topWord);
        WordsTableWriter wordsTableWriter = new WordsTableWriter(topWord);

        logger.debug("Reading words");

        for (var entry : journalReader) {
            if (!isRelevantEntry(entry)) {
                continue;
@@ -119,8 +114,6 @@ public class SearchIndexConverter {
            }
        }

        logger.debug("Rearranging table");

        wordsTableWriter.write(outputFileWords);

        return wordsTableWriter.getTable();
@@ -130,15 +123,12 @@ public class SearchIndexConverter {
                                    Path tmpUrlsFile,
                                    WordIndexOffsetsTable wordOffsetsTable) throws IOException
    {
        logger.info("Table size = {}", wordOffsetsTable.length());

        long numberOfWordsTotal = 0;
        for (var entry : journalReader) {
            if (isRelevantEntry(entry))
                numberOfWordsTotal += entry.wordCount();
        }


        try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
             FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) {

@@ -168,7 +158,6 @@ public class SearchIndexConverter {
                }
            }


            rwf.write(urlsTmpFileChannel);
        }

@@ -176,8 +165,6 @@ public class SearchIndexConverter {

        try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) {
            if (wordOffsetsTable.length() > 0) {
                logger.info("Sorting urls table");

                var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit);

                wordOffsetsTable.forEachRange(urlTmpFileSorter::sort);
@@ -188,7 +175,6 @@ public class SearchIndexConverter {
            }
        }

        logger.info("Writing BTree");
        try (var urlsFileMap = MultimapFileLong.forOutput(outputFileUrls.toPath(), numberOfWordsTotal)) {
            var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);

@@ -206,7 +192,6 @@ public class SearchIndexConverter {
        }
    }


    private long translateUrl(long url) {
        int domainId = partitioner.translateId(bucketId, (int) (url >>> 32));
        return ((long)domainId << 32) | (url & 0xFFFFFFFFL);
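translateUrl above keeps the low 32 bits (the url id within its domain) and swaps the high 32 bits for the remapped domain id. A small standalone sketch of that packing with made-up ids (editorial, not from the commit):

public class UrlIdPackingDemo {
    public static void main(String[] args) {
        long url = (123L << 32) | 456L;   // original: domain 123 in the high word, url 456 in the low word
        int remappedDomainId = 789;       // stand-in for what partitioner.translateId(...) returns

        long translated = ((long) remappedDomainId << 32) | (url & 0xFFFFFFFFL);

        System.out.println(translated >>> 32);         // 789 -> new domain id in the high word
        System.out.println(translated & 0xFFFFFFFFL);  // 456 -> low word (url id) is preserved
    }
}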
@@ -43,11 +43,11 @@ public class WordsTableWriter {

            var writer = new BTreeWriter(mmf, wordsBTreeContext);

            writer.write(offset, tableSize, this::writeBTreeBlock);
            writer.write(offset, tableSize, this::writeBTreeDataBlock);
        }
    }

    private void writeBTreeBlock(MultimapFileLongSlice mapSlice) {
    private void writeBTreeDataBlock(MultimapFileLongSlice mapSlice) {
        long urlFileOffset = 0;
        int idx = 0;
@@ -27,4 +27,15 @@ public class EdgeCrawlPlan {
        }
    }

    public Path getCrawledFilePath(String fileName) {
        String sp1 = fileName.substring(0, 2);
        String sp2 = fileName.substring(2, 4);
        return crawl.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
    }

    public Path getProcessedFilePath(String fileName) {
        String sp1 = fileName.substring(0, 2);
        String sp2 = fileName.substring(2, 4);
        return process.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
    }
}
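Both helpers shard files into two directory levels taken from the first four characters of the file name. A small standalone sketch with a hypothetical file name and crawl directory (editorial, not from the commit):

import java.nio.file.Path;

public class ShardedPathDemo {
    public static void main(String[] args) {
        String fileName = "4a7bde12f0.zstd";   // hypothetical crawl file name
        Path dir = Path.of("/data/crawl");     // hypothetical crawl directory

        String sp1 = fileName.substring(0, 2); // "4a"
        String sp2 = fileName.substring(2, 4); // "7b"

        System.out.println(dir.resolve(sp1).resolve(sp2).resolve(fileName));
        // -> /data/crawl/4a/7b/4a7bde12f0.zstd
    }
}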
@@ -0,0 +1,56 @@
package nu.marginalia.util;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.*;

class DenseBitMapTest {

    @Test
    public void testSetAll() {
        var dbm = new DenseBitMap(129);
        for (int i = 0; i < dbm.cardinality; i++) {
            dbm.set(i);
        }

        for (int i = 0; i < dbm.cardinality; i++) {
            assertTrue(dbm.get(i));
        }
    }

    @Test
    public void testSetEven() {
        var dbm = new DenseBitMap(131);
        for (int i = 0; i < dbm.cardinality; i+=2) {
            dbm.set(i);
        }

        for (int i = 0; i < dbm.cardinality; i+=2) {
            assertTrue(dbm.get(i));
        }

        for (int i = 1; i < dbm.cardinality; i+=2) {
            assertFalse(dbm.get(i));
        }
    }

    @Test
    public void testSetAllClearSome() {
        var dbm = new DenseBitMap(129);

        for (int i = 0; i < dbm.cardinality; i++) {
            dbm.set(i);
        }
        for (int i = 1; i < dbm.cardinality; i+=2) {
            dbm.clear(i);
        }

        for (int i = 0; i < dbm.cardinality; i+=2) {
            assertTrue(dbm.get(i), "Expected " + i + " to be set");
        }

        for (int i = 1; i < dbm.cardinality; i+=2) {
            assertFalse(dbm.get(i), "Expected " + i + " to be clear");
        }
    }
}