Anchor text capture work-in-progress

vlofgren 2022-06-22 12:57:58 +02:00
parent 1068694db6
commit 35878c5102
11 changed files with 343 additions and 44 deletions

View File

@@ -0,0 +1,37 @@
package nu.marginalia.util;
import java.nio.ByteBuffer;
public class DenseBitMap {
// (1L << 34) - 8 bits is 2^31 - 1 bytes, the largest direct buffer Java can allocate
public static final long MAX_CAPACITY_2GB_16BN_ITEMS = (1L << 34) - 8;
public final long cardinality;
private final ByteBuffer buffer;
public DenseBitMap(long cardinality) {
this.cardinality = cardinality;
boolean misaligned = (cardinality & 7) > 0;
this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0)));
}
public boolean get(long pos) {
return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0;
}
/** Sets the bit indexed by pos and returns
 * its previous value.
 */
public boolean set(long pos) {
int offset = (int) (pos >>> 3);
int oldVal = buffer.get(offset);
int mask = (byte) 1 << (int) (pos & 7);
buffer.put(offset, (byte) (oldVal | mask));
return (oldVal & mask) != 0;
}
public void clear(long pos) {
int offset = (int)(pos >>> 3);
buffer.put(offset, (byte)(buffer.get(offset) & ~(byte)(1 << (int)(pos & 7))));
}
}
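For orientation, a short usage sketch (illustrative, not part of the commit): pos >>> 3 selects the byte and pos & 7 the bit within it, so get, set and clear all agree on the same mask.
// For pos = 11: byte index 11 >>> 3 = 1, mask 1 << (11 & 7) = 0b1000.
var bits = new DenseBitMap(64);
boolean wasSet = bits.set(11);  // false: set() returns the previous value
boolean isSet = bits.get(11);   // true
bits.clear(11);                 // get(11) is false again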

View File

@@ -125,7 +125,7 @@ public class RandomWriteFunnel implements AutoCloseable {
dest.putLong(addr, data);
}
catch (IndexOutOfBoundsException ex) {
- logger.info("!!!bad[{}]={}", addr, data);
+ logger.info("Bad poke[{}]={}, this happens if an RWF is allocated with insufficient size", addr, data);
}
}
buffer.compact();

View File

@@ -5,8 +5,6 @@ import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapSearcher;
import javax.annotation.CheckReturnValue;
import static java.lang.Math.min;
public class BTreeReader {
@@ -68,7 +66,7 @@ public class BTreeReader {
for (int i = header.layers() - 1; i >= 0; --i) {
final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
- final long nextLayerOffset = indexSearch(key, indexAddress + indexLayerBlockOffset, blockSize);
+ final long nextLayerOffset = relativePositionInIndex(key, indexAddress + indexLayerBlockOffset, blockSize);
if (nextLayerOffset < 0)
return nextLayerOffset;
@@ -78,7 +76,7 @@ public class BTreeReader {
return layerOffset;
}
- private long indexSearch(long key, long start, long n) {
+ private long relativePositionInIndex(long key, long start, long n) {
return indexSearcher.binarySearchUpper(key, start, n) - start;
}

View File

@@ -2,15 +2,17 @@ package nu.marginalia.util.language.processing.model;
import nu.marginalia.util.language.WordPatterns;
import org.jetbrains.annotations.NotNull;
import java.lang.ref.SoftReference;
import java.util.BitSet;
import java.util.Iterator;
import java.util.StringJoiner;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
- public class DocumentSentence {
+ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos> {
public final String originalSentence;
public final String[] words;
public final int[] separators;
@@ -85,4 +87,37 @@ public class DocumentSentence {
public String toString() {
return IntStream.range(0, length()).mapToObj(i -> String.format("%s[%s]", words[i], posTags[i])).collect(Collectors.joining(" "));
}
@NotNull
@Override
public Iterator<SentencePos> iterator() {
return new Iterator<>() {
int i = -1;
@Override
public boolean hasNext() {
return i+1 < length();
}
@Override
public SentencePos next() {
return new SentencePos(++i);
}
};
}
public class SentencePos {
public final int pos;
public SentencePos(int pos) {
this.pos = pos;
}
public String word() { return words[pos]; }
public String wordLowerCase() { return wordsLowerCase[pos]; }
public String posTag() { return posTags[pos]; }
public String stemmed() { return stemmedWords[pos]; }
public int separator() { return separators[pos]; }
public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }
}
}
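The new Iterable implementation lets callers swap index loops over words[] and posTags[] for a for-each over SentencePos. A usage sketch, assuming a DocumentLanguageData dld obtained from SentenceExtractor.extractSentences:
// Sketch: iterate word positions directly instead of indexing the arrays.
for (var sentence : dld.sentences) {
    for (var pos : sentence) {
        if (pos.isStopWord())
            continue;
        System.out.println(pos.wordLowerCase() + "/" + pos.posTag());
    }
}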

View File

@@ -52,13 +52,6 @@ public class ConverterMain {
injector.getInstance(ConverterMain.class);
}
- private static void requireArgs(String[] args, String... help) {
- if (args.length != help.length) {
- System.out.println("Usage: " + String.join(", ", help));
- System.exit(255);
- }
- }
@Inject
public ConverterMain(
EdgeCrawlPlan plan,
@@ -103,7 +96,8 @@ public class ConverterMain {
domainToId.forEach((domain, id) -> {
String fileName = idToFileName.get(id);
- Path dest = getFilePath(plan.crawl.getDir(), fileName);
+ Path dest = plan.getCrawledFilePath(fileName);
logger.info("{} - {} - {}", domain, id, dest);
if (!processLog.isJobFinished(id)) {
@@ -128,10 +122,4 @@ public class ConverterMain {
record ProcessingInstructions(String id, List<Instruction> instructions) {}
- private Path getFilePath(Path dir, String fileName) {
- String sp1 = fileName.substring(0, 2);
- String sp2 = fileName.substring(2, 4);
- return dir.resolve(sp1).resolve(sp2).resolve(fileName);
- }
}

View File

@@ -0,0 +1,194 @@
package nu.marginalia.wmsa.edge.converting;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.function.Predicate;
import java.util.regex.Pattern;
public class LinkKeywordExtractorMain {
private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class);
public static void main(String... args) throws IOException {
if (args.length != 1) {
System.err.println("Arguments: crawl-plan.yaml");
System.exit(1); // non-zero exit: incorrect usage is an error
}
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
Injector injector = Guice.createInjector(
new ConverterModule(plan)
);
injector.getInstance(LinkKeywordExtractorMain.class);
}
private final HashSet<String> crawledDomains = new HashSet<>();
private final List<String> fileNames = new ArrayList<>();
private final LinkParser linkParser = new LinkParser();
private final SentenceExtractor sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
private final HashFunction hashFunction = Hashing.murmur3_128();
// This bit map is used as a bloom filter to deduplicate url-keyword combinations.
// False positives are expected, but that's an acceptable trade-off for not having
// to de-duplicate billions of shuffled (url, word) tuples on limited hardware.
private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
@Inject
public LinkKeywordExtractorMain(EdgeCrawlPlan plan) throws IOException {
logger.info("Loading input spec");
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> crawledDomains.add(spec.domain));
logger.info("Replaying crawl log");
WorkLog.readLog(plan.crawl.getLogFile(),
entry -> fileNames.add(entry.path()));
logger.info("Reading files");
for (var fn : fileNames) {
CrawledDomainReader crawledDomainReader = new CrawledDomainReader();
var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn));
if (crawledDomain.doc == null) continue;
System.out.println("# " + crawledDomain.domain);
for (var doc : crawledDomain.doc) {
try {
if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
processDocument(doc.url, doc.documentBody);
}
}
catch (URISyntaxException ex) {
// This Shouldn't Happen (TM) as the URL that we're failing to process
// is expected to have already been parsed by this code successfully
// in the process of getting here.
//
// But also, if it does happen, it's no big deal
logger.warn("Bad URL format", ex);
}
}
}
}
private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+");
private void processDocument(String docUrl, String documentBody) throws URISyntaxException {
var processed = Jsoup.parse(documentBody);
EdgeUrl documentUrl = new EdgeUrl(docUrl);
for (var link : processed.getElementsByTag("a")) {
if (link.hasAttr("href")) {
String href = link.attr("href");
String text = anchorTextNoise.matcher(link.text().toLowerCase()).replaceAll(" ").trim();
processAnchor(documentUrl, href, text);
}
}
}
private void processAnchor(EdgeUrl documentUrl, String href, String text) {
if (!isInterestingAnchorText(text)) {
return;
}
var optLinkUrl = linkParser.parseLink(documentUrl, href);
if (optLinkUrl.isEmpty()) return;
var linkUrl = optLinkUrl.get();
if (!isInterestingAnchorLink(linkUrl)) {
return;
}
DocumentLanguageData languageData = sentenceExtractor.extractSentences(text);
for (var sent : languageData.sentences) {
for (var wordPos : sent) {
if (wordPos.isStopWord())
continue;
String word = wordPos.wordLowerCase();
if (!WordPatterns.wordQualitiesPredicate.test(word) || !WordPatterns.filter(word))
continue;
if (!linkUrl.domain.equals(documentUrl.domain)) {
if (isNewKeywordForLink(word, linkUrl.toString())) {
System.out.println(linkUrl + "\t" + word);
}
}
}
}
}
// This pattern doesn't need to perfectly capture all anchor texts that are URLs; catching ~95% of them is fine
private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();
private boolean isInterestingAnchorText(String text) {
if (text.isBlank()) return false;
if (text.length() > 32) return false;
// Google loves questions, and so do SEO spammers
if (text.endsWith("?")) return false;
if (text.startsWith("http:") || text.startsWith("https:")) return false;
if (looksLikeAnURL.test(text)) return false;
return switch (text) {
case "this", "here", "click", "click here", "download", "source" -> false;
default -> true;
};
}
private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
if (!(linkUrl.proto.equals("http") || linkUrl.proto.equals("https"))) {
return false;
}
return crawledDomains.contains(linkUrl.domain.toString());
}
private boolean isNewKeywordForLink(String word, String url) {
long hash = 0;
hash ^= hashFunction.hashString(word, StandardCharsets.UTF_8).asLong();
hash ^= hashFunction.hashString(url, StandardCharsets.UTF_8).asLong();
// Remove sign bit because we don't want a negative index in deduplicateHashBitset
hash &= 0x7FFF_FFFF_FFFF_FFFFL;
return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
}
}
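To make the deduplication concrete, here is the hash path in isolation, a rough sketch with made-up inputs rather than part of the commit: the two 64-bit murmur3 hashes are XORed, the sign bit is masked off so the index is non-negative, and the result is reduced modulo the bitmap's cardinality. Since DenseBitMap.set() returns the previous bit value, a true return means the (url, word) pair was probably seen before.
// Sketch only; the URL and keyword are made-up inputs, and
// bitmap stands in for the deduplicateHashBitset field above.
HashFunction h = Hashing.murmur3_128();
long hash = h.hashString("https://example.com/page", StandardCharsets.UTF_8).asLong()
          ^ h.hashString("keyword", StandardCharsets.UTF_8).asLong();
hash &= 0x7FFF_FFFF_FFFF_FFFFL;          // clear the sign bit
long index = hash % bitmap.cardinality;  // reduce to a valid bit index
boolean seenBefore = bitmap.set(index);  // true may be a false positive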

View File

@@ -145,13 +145,8 @@ public class LinkParser {
}
private boolean isRelRelevant(String rel) {
- if (null == rel) {
- return true;
- }
- return switch (rel) {
- case "noindex" -> false;
- default -> true;
- };
+ // null-safe: "noindex".equalsIgnoreCase(null) is simply false
+ return !"noindex".equalsIgnoreCase(rel);
}
private boolean isUrlRelevant(String href) {
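As a quick sanity check of the simplified isRelRelevant above, a sketch of the interesting cases; note that equalsIgnoreCase also makes the match case-insensitive, which the old switch was not:
isRelRelevant(null);       // true: "noindex".equalsIgnoreCase(null) is false
isRelRelevant("noindex");  // false
isRelRelevant("NoIndex");  // false: new behavior, the old switch matched exact case only
isRelRelevant("nofollow"); // true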

View File

@@ -91,18 +91,13 @@ public class SearchIndexConverter {
}
}
private WordIndexOffsetsTable createWordIndexTable(SearchIndexJournalReader journalReader,
File outputFileWords) throws IOException
{
final int topWord = (int) journalReader.fileHeader.wordCount();
logger.debug("Table size = {}", topWord);
WordsTableWriter wordsTableWriter = new WordsTableWriter(topWord);
logger.debug("Reading words");
for (var entry : journalReader) {
if (!isRelevantEntry(entry)) {
continue;
@@ -119,8 +114,6 @@ public class SearchIndexConverter {
}
}
logger.debug("Rearranging table");
wordsTableWriter.write(outputFileWords);
return wordsTableWriter.getTable();
@@ -130,15 +123,12 @@ public class SearchIndexConverter {
Path tmpUrlsFile,
WordIndexOffsetsTable wordOffsetsTable) throws IOException
{
logger.info("Table size = {}", wordOffsetsTable.length());
long numberOfWordsTotal = 0;
for (var entry : journalReader) {
if (isRelevantEntry(entry))
numberOfWordsTotal += entry.wordCount();
}
try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) {
@@ -168,7 +158,6 @@ public class SearchIndexConverter {
}
}
rwf.write(urlsTmpFileChannel);
}
@@ -176,8 +165,6 @@ public class SearchIndexConverter {
try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) {
if (wordOffsetsTable.length() > 0) {
logger.info("Sorting urls table");
var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit);
wordOffsetsTable.forEachRange(urlTmpFileSorter::sort);
@@ -188,7 +175,6 @@ public class SearchIndexConverter {
}
}
logger.info("Writing BTree");
try (var urlsFileMap = MultimapFileLong.forOutput(outputFileUrls.toPath(), numberOfWordsTotal)) {
var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);
@@ -206,7 +192,6 @@ public class SearchIndexConverter {
}
}
private long translateUrl(long url) {
int domainId = partitioner.translateId(bucketId, (int) (url >>> 32));
return ((long)domainId << 32) | (url & 0xFFFFFFFFL);
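The packing in translateUrl keeps the translated domain id in the upper 32 bits and the url id in the lower 32. A small worked sketch with made-up ids:
// Sketch only, with made-up ids: domain 5 / url 42 in, domain 7 / url 42 out.
long url = (5L << 32) | 42L;
int oldDomainId = (int) (url >>> 32);  // 5
long urlId = url & 0xFFFFFFFFL;        // 42
int newDomainId = 7;                   // stands in for partitioner.translateId(bucketId, 5)
long translated = ((long) newDomainId << 32) | urlId;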

View File

@@ -43,11 +43,11 @@ public class WordsTableWriter {
var writer = new BTreeWriter(mmf, wordsBTreeContext);
- writer.write(offset, tableSize, this::writeBTreeBlock);
+ writer.write(offset, tableSize, this::writeBTreeDataBlock);
}
}
- private void writeBTreeBlock(MultimapFileLongSlice mapSlice) {
+ private void writeBTreeDataBlock(MultimapFileLongSlice mapSlice) {
long urlFileOffset = 0;
int idx = 0;

View File

@@ -27,4 +27,15 @@ public class EdgeCrawlPlan {
}
}
public Path getCrawledFilePath(String fileName) {
String sp1 = fileName.substring(0, 2);
String sp2 = fileName.substring(2, 4);
return crawl.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
}
public Path getProcessedFilePath(String fileName) {
String sp1 = fileName.substring(0, 2);
String sp2 = fileName.substring(2, 4);
return process.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
}
}
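These helpers reproduce the two-level directory sharding that ConverterMain previously did inline: the first two and next two characters of the file name select the subdirectories. A sketch with a hypothetical file name:
// Hypothetical file name: with a crawl dir of /data/crawl,
// getCrawledFilePath("ab12cd.zstd") resolves to /data/crawl/ab/12/ab12cd.zstd
Path dest = plan.getCrawledFilePath("ab12cd.zstd");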

View File

@@ -0,0 +1,56 @@
package nu.marginalia.util;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
class DenseBitMapTest {
@Test
public void testSetAll() {
var dbm = new DenseBitMap(129);
for (int i = 0; i < dbm.cardinality; i++) {
dbm.set(i);
}
for (int i = 0; i < dbm.cardinality; i++) {
assertTrue(dbm.get(i));
}
}
@Test
public void testSetEven() {
var dbm = new DenseBitMap(131);
for (int i = 0; i < dbm.cardinality; i+=2) {
dbm.set(i);
}
for (int i = 0; i < dbm.cardinality; i+=2) {
assertTrue(dbm.get(i));
}
for (int i = 1; i < dbm.cardinality; i+=2) {
assertFalse(dbm.get(i));
}
}
@Test
public void testSetAllClearSome() {
var dbm = new DenseBitMap(129);
for (int i = 0; i < dbm.cardinality; i++) {
dbm.set(i);
}
for (int i = 1; i < dbm.cardinality; i+=2) {
dbm.clear(i);
}
for (int i = 0; i < dbm.cardinality; i+=2) {
assertTrue(dbm.get(i), "Expected " + i + " to be set");
}
for (int i = 1; i < dbm.cardinality; i+=2) {
assertFalse(dbm.get(i), "Expected " + i + " to be clear");
}
}
}