diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java new file mode 100644 index 00000000..39b34048 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java @@ -0,0 +1,37 @@ +package nu.marginalia.util; + +import java.nio.ByteBuffer; + +public class DenseBitMap { + public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8; + + public final long cardinality; + private final ByteBuffer buffer; + + public DenseBitMap(long cardinality) { + this.cardinality = cardinality; + + boolean misaligned = (cardinality & 7) > 0; + this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0))); + } + + public boolean get(long pos) { + return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0; + } + + /** Set the bit indexed by pos, returns + * its previous value. + */ + public boolean set(long pos) { + int offset = (int) (pos >>> 3); + int oldVal = buffer.get(offset); + int mask = (byte) 1 << (int) (pos & 7); + buffer.put(offset, (byte) (oldVal | mask)); + return (oldVal & mask) != 0; + } + + public void clear(long pos) { + int offset = (int)(pos >>> 3); + buffer.put(offset, (byte)(buffer.get(offset) & ~(byte)(1 << (int)(pos & 7)))); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java b/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java index 0c274c2b..ada8de71 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java @@ -125,7 +125,7 @@ public class RandomWriteFunnel implements AutoCloseable { dest.putLong(addr, data); } catch (IndexOutOfBoundsException ex) { - logger.info("!!!bad[{}]={}", addr, data); + logger.info("Bad poke[{}]={}, this happens if an RWF is allocated with insufficient size", addr, data); } } buffer.compact(); diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java index 5d86c4d2..388eb175 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java @@ -5,8 +5,6 @@ import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.multimap.MultimapSearcher; -import javax.annotation.CheckReturnValue; - import static java.lang.Math.min; public class BTreeReader { @@ -68,7 +66,7 @@ public class BTreeReader { for (int i = header.layers() - 1; i >= 0; --i) { final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset; - final long nextLayerOffset = indexSearch(key, indexAddress + indexLayerBlockOffset, blockSize); + final long nextLayerOffset = relativePositionInIndex(key, indexAddress + indexLayerBlockOffset, blockSize); if (nextLayerOffset < 0) return nextLayerOffset; @@ -78,7 +76,7 @@ public class BTreeReader { return layerOffset; } - private long indexSearch(long key, long start, long n) { + private long relativePositionInIndex(long key, long start, long n) { return indexSearcher.binarySearchUpper(key, start, n) - start; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java index b4406954..5630939f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java @@ -2,15 +2,17 @@ package nu.marginalia.util.language.processing.model; import nu.marginalia.util.language.WordPatterns; +import org.jetbrains.annotations.NotNull; import java.lang.ref.SoftReference; import java.util.BitSet; +import 
java.util.Iterator; import java.util.StringJoiner; import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; -public class DocumentSentence { +public class DocumentSentence implements Iterable<DocumentSentence.SentencePos> { public final String originalSentence; public final String[] words; public final int[] separators; @@ -85,4 +87,37 @@ public class DocumentSentence { public String toString() { return IntStream.range(0, length()).mapToObj(i -> String.format("%s[%s]", words[i], posTags[i])).collect(Collectors.joining(" ")); } + + @NotNull + @Override + public Iterator<SentencePos> iterator() { + return new Iterator<>() { + int i = -1; + @Override + public boolean hasNext() { + return i+1 < length(); + } + + @Override + public SentencePos next() { + return new SentencePos(++i); + } + }; + } + + public class SentencePos { + public final int pos; + + public SentencePos(int pos) { + this.pos = pos; + } + + public String word() { return words[pos]; } + public String wordLowerCase() { return wordsLowerCase[pos]; } + public String posTag() { return posTags[pos]; } + public String stemmed() { return stemmedWords[pos]; } + public int separator() { return separators[pos]; } + public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); } + } } + diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java index 5d6f2762..61ff0b00 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java @@ -52,13 +52,6 @@ public class ConverterMain { injector.getInstance(ConverterMain.class); } - private static void requireArgs(String[] args, String... 
help) { - if (args.length != help.length) { - System.out.println("Usage: " + String.join(", ", help)); - System.exit(255); - } - } - @Inject public ConverterMain( EdgeCrawlPlan plan, @@ -103,7 +96,8 @@ public class ConverterMain { domainToId.forEach((domain, id) -> { String fileName = idToFileName.get(id); - Path dest = getFilePath(plan.crawl.getDir(), fileName); + Path dest = plan.getCrawledFilePath(fileName); + logger.info("{} - {} - {}", domain, id, dest); if (!processLog.isJobFinished(id)) { @@ -128,10 +122,4 @@ public class ConverterMain { record ProcessingInstructions(String id, List instructions) {} - private Path getFilePath(Path dir, String fileName) { - String sp1 = fileName.substring(0, 2); - String sp2 = fileName.substring(2, 4); - return dir.resolve(sp1).resolve(sp2).resolve(fileName); - } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java new file mode 100644 index 00000000..63c26200 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java @@ -0,0 +1,194 @@ +package nu.marginalia.wmsa.edge.converting; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import com.google.inject.Guice; +import com.google.inject.Inject; +import com.google.inject.Injector; +import nu.marginalia.util.DenseBitMap; +import nu.marginalia.util.language.WordPatterns; +import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.wmsa.configuration.WmsaHome; +import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; +import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; +import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader; +import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader; +import 
nu.marginalia.wmsa.edge.crawling.WorkLog; +import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import org.jsoup.Jsoup; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +public class LinkKeywordExtractorMain { + private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class); + + public static void main(String... args) throws IOException { + + if (args.length != 1) { + System.err.println("Arguments: crawl-plan.yaml"); + System.exit(0); + } + var plan = new CrawlPlanLoader().load(Path.of(args[0])); + + Injector injector = Guice.createInjector( + new ConverterModule(plan) + ); + + injector.getInstance(LinkKeywordExtractorMain.class); + } + + private final HashSet<String> crawledDomains = new HashSet<>(); + private final List<String> fileNames = new ArrayList<>(); + private final LinkParser linkParser = new LinkParser(); + private final SentenceExtractor sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels()); + + private final HashFunction hashFunction = Hashing.murmur3_128(); + + // This bit map is used as a bloom filter to deduplicate url-keyword combinations + // false positives are expected, but that's an acceptable trade-off to not have to deal with + // de-duplicating billions of shuffled (url, word) tuples on limited hardware + private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS); + + @Inject + public LinkKeywordExtractorMain(EdgeCrawlPlan plan) throws IOException { + logger.info("Loading input spec"); + + 
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(), + spec -> crawledDomains.add(spec.domain)); + + logger.info("Replaying crawl log"); + WorkLog.readLog(plan.crawl.getLogFile(), + entry -> fileNames.add(entry.path())); + + logger.info("Reading files"); + for (var fn : fileNames) { + CrawledDomainReader crawledDomainReader = new CrawledDomainReader(); + var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn)); + if (crawledDomain.doc == null) continue; + + System.out.println("# " + crawledDomain.domain); + + for (var doc : crawledDomain.doc) { + try { + if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) { + processDocument(doc.url, doc.documentBody); + } + } + catch (URISyntaxException ex) { + // This Shouldn't Happen (TM) as the URL that we're failing to process + // is expected to have already been parsed by this code successfully + // in the process of getting here. + // + // But also, if it does happen, it's no big deal + + logger.warn("Bad URL format", ex); + } + } + } + } + + private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+"); + + private void processDocument(String docUrl, String documentBody) throws URISyntaxException { + var processed = Jsoup.parse(documentBody); + + EdgeUrl documentUrl = new EdgeUrl(docUrl); + + for (var link : processed.getElementsByTag("a")) { + if (link.hasAttr("href")) { + String href = link.attr("href"); + String text = anchorTextNoise.matcher(link.text().toLowerCase()).replaceAll(" ").trim(); + + processAnchor(documentUrl, href, text); + } + } + } + + private void processAnchor(EdgeUrl documentUrl, String href, String text) { + if (!isInterestingAnchorText(text)) { + return; + } + + var optLinkUrl = linkParser.parseLink(documentUrl, href); + if (optLinkUrl.isEmpty()) return; + + var linkUrl = optLinkUrl.get(); + + if (!isInterestingAnchorLink(linkUrl)) { + return; + } + + DocumentLanguageData languageData = sentenceExtractor.extractSentences(text); + for (var sent : 
languageData.sentences) { + for (var wordPos : sent) { + if (wordPos.isStopWord()) + continue; + + String word = wordPos.wordLowerCase(); + if (!WordPatterns.wordQualitiesPredicate.test(word) || !WordPatterns.filter(word)) + continue; + + + if (!linkUrl.domain.equals(documentUrl.domain)) { + if (isNewKeywordForLink(word, linkUrl.toString())) { + System.out.println(linkUrl + "\t" + word); + } + } + } + } + } + + // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine + private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate(); + + private boolean isInterestingAnchorText(String text) { + if (text.isBlank()) return false; + if (text.length() > 32) return false; + + // Google loves questions, and so does SEO spammers + if (text.endsWith("?")) return false; + + if (text.startsWith("http:") || text.startsWith("https:")) return false; + + if (looksLikeAnURL.test(text)) return false; + + return switch (text) { + case "this", "here", "click", "click here", "download", "source" -> false; + default -> true; + }; + } + + private boolean isInterestingAnchorLink(EdgeUrl linkUrl) { + if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) { + return false; + } + + return crawledDomains.contains(linkUrl.domain.toString()); + } + + private boolean isNewKeywordForLink(String href, String text) { + long hash = 0; + + hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong(); + hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong(); + + // Remove sign bit because we don't want a negative index in deduplicateHashBitset + hash &= 0x7FFF_FFFF_FFFF_FFFFL; + + return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index 378182f2..0a2bdf45 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -145,13 +145,8 @@ public class LinkParser { } private boolean isRelRelevant(String rel) { - if (null == rel) { - return true; - } - return switch (rel) { - case "noindex" -> false; - default -> true; - }; + // this is null safe + return !"noindex".equalsIgnoreCase(rel); } private boolean isUrlRelevant(String href) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java index 2d12d0f4..afa319f4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java @@ -91,18 +91,13 @@ public class SearchIndexConverter { } } - - private WordIndexOffsetsTable createWordIndexTable(SearchIndexJournalReader journalReader, File outputFileWords) throws IOException { final int topWord = (int) journalReader.fileHeader.wordCount(); - logger.debug("Table size = {}", topWord); WordsTableWriter wordsTableWriter = new WordsTableWriter(topWord); - logger.debug("Reading words"); - for (var entry : journalReader) { if (!isRelevantEntry(entry)) { continue; @@ -119,8 +114,6 @@ public class SearchIndexConverter { } } - logger.debug("Rearranging table"); - wordsTableWriter.write(outputFileWords); return wordsTableWriter.getTable(); @@ -130,15 +123,12 @@ public class SearchIndexConverter { Path tmpUrlsFile, WordIndexOffsetsTable wordOffsetsTable) throws IOException { - logger.info("Table size = {}", wordOffsetsTable.length()); - long numberOfWordsTotal = 0; for (var entry : 
journalReader) { if (isRelevantEntry(entry)) numberOfWordsTotal += entry.wordCount(); } - try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw"); FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) { @@ -168,7 +158,6 @@ public class SearchIndexConverter { } } - rwf.write(urlsTmpFileChannel); } @@ -176,8 +165,6 @@ public class SearchIndexConverter { try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) { if (wordOffsetsTable.length() > 0) { - logger.info("Sorting urls table"); - var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit); wordOffsetsTable.forEachRange(urlTmpFileSorter::sort); @@ -188,7 +175,6 @@ public class SearchIndexConverter { } } - logger.info("Writing BTree"); try (var urlsFileMap = MultimapFileLong.forOutput(outputFileUrls.toPath(), numberOfWordsTotal)) { var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext); @@ -206,7 +192,6 @@ public class SearchIndexConverter { } } - private long translateUrl(long url) { int domainId = partitioner.translateId(bucketId, (int) (url >>> 32)); return ((long)domainId << 32) | (url & 0xFFFFFFFFL); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java index 7f762ff3..15ad0cd3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java @@ -43,11 +43,11 @@ public class WordsTableWriter { var writer = new BTreeWriter(mmf, wordsBTreeContext); - writer.write(offset, tableSize, this::writeBTreeBlock); + writer.write(offset, tableSize, this::writeBTreeDataBlock); } } - private void writeBTreeBlock(MultimapFileLongSlice mapSlice) { + private void writeBTreeDataBlock(MultimapFileLongSlice mapSlice) { long 
urlFileOffset = 0; int idx = 0; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java index 4e237908..264c1051 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java @@ -27,4 +27,15 @@ public class EdgeCrawlPlan { } } + public Path getCrawledFilePath(String fileName) { + String sp1 = fileName.substring(0, 2); + String sp2 = fileName.substring(2, 4); + return crawl.getDir().resolve(sp1).resolve(sp2).resolve(fileName); + } + + public Path getProcessedFilePath(String fileName) { + String sp1 = fileName.substring(0, 2); + String sp2 = fileName.substring(2, 4); + return process.getDir().resolve(sp1).resolve(sp2).resolve(fileName); + } } diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/DenseBitMapTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/DenseBitMapTest.java new file mode 100644 index 00000000..20857947 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/util/DenseBitMapTest.java @@ -0,0 +1,56 @@ +package nu.marginalia.util; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class DenseBitMapTest { + + @Test + public void testSetAll() { + var dbm = new DenseBitMap(129); + for (int i = 0; i < dbm.cardinality; i++) { + dbm.set(i); + } + + for (int i = 0; i < dbm.cardinality; i++) { + assertTrue(dbm.get(i)); + } + } + + @Test + public void testSetEven() { + var dbm = new DenseBitMap(131); + for (int i = 0; i < dbm.cardinality; i+=2) { + dbm.set(i); + } + + for (int i = 0; i < dbm.cardinality; i+=2) { + assertTrue(dbm.get(i)); + } + + for (int i = 1; i < dbm.cardinality; i+=2) { + assertFalse(dbm.get(i)); + } + } + + @Test + public void testSetAllClearSome() { + var dbm = new DenseBitMap(129); + + for (int i = 0; i < dbm.cardinality; i++) { + dbm.set(i); + } + for 
(int i = 1; i < dbm.cardinality; i+=2) { + dbm.clear(i); + } + + for (int i = 0; i < dbm.cardinality; i+=2) { + assertTrue(dbm.get(i), "Expected " + i + " to be set"); + } + + for (int i = 1; i < dbm.cardinality; i+=2) { + assertFalse(dbm.get(i), "Expected " + i + " to be clear"); + } + } +} \ No newline at end of file