package nu.marginalia.wmsa.edge.converting;

import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Consumer;

/**
 * One-shot CLI tool: reads a TSV file of (url, keyword) pairs — grouped by URL —
 * and uploads each URL's keyword set to the edge index under IndexBlock.Link.
 *
 * <p>Usage: {@code LinkKeywordLoaderMain <url-keyword-tsv-file>}
 */
public class LinkKeywordLoaderMain {

    public static void main(String... args) {
        if (args.length != 1) {
            System.err.println("Usage: LinkKeywordLoaderMain <url-keyword-tsv-file>");
            System.exit(1);
        }

        Map<String, Long> urlToId = getUrls();

        try (EdgeIndexClient indexClient = new EdgeIndexClient();
             var lines = Files.lines(Path.of(args[0])))
        {
            var uploader = new Uploader(urlToId, indexClient);

            lines.map(UrlKeyword::parseLine)
                 .filter(Objects::nonNull)
                 .forEach(uploader);

            // The uploader only flushes a batch when it sees the URL change,
            // so flush explicitly or the final URL's keywords are dropped.
            uploader.finish();
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** A single (url, keyword) pair parsed from one input line. */
    private record UrlKeyword(String url, String keyword) {
        /** Parses a "url\tkeyword" line; returns null for malformed lines. */
        public static UrlKeyword parseLine(String line) {
            String[] parts = line.split("\t");
            if (parts.length == 2) {
                return new UrlKeyword(parts[0], parts[1]);
            }
            return null;
        }
    }

    /**
     * Accumulates keywords per URL (input must arrive grouped by URL) and
     * uploads each completed group to the index. Call {@link #finish()} after
     * the last element to flush the trailing group.
     */
    private static class Uploader implements Consumer<UrlKeyword> {
        private final Map<String, Long> urlToId;
        private final EdgeIndexClient indexClient;

        // URL whose keywords are currently being accumulated; null before first element
        private String currentUrl = null;
        private final Set<String> keywords = new HashSet<>(100);

        private Uploader(Map<String, Long> urlToId, EdgeIndexClient indexClient) {
            this.urlToId = urlToId;
            this.indexClient = indexClient;
        }

        @Override
        public void accept(UrlKeyword urlKeyword) {
            if (urlKeyword == null) return;

            if (!urlKeyword.url.equals(currentUrl)) {
                finish();                       // upload the previous URL's batch, if any
                currentUrl = urlKeyword.url;
            }
            keywords.add(urlKeyword.keyword);
        }

        /** Uploads and clears the keyword batch for the current URL, if any. */
        public void finish() {
            if (currentUrl != null && !keywords.isEmpty()) {
                Long id = urlToId.get(currentUrl);

                if (id != null) {
                    // Combined id layout: domain id in the high 32 bits, url id in the low 32
                    int urlId = (int) (id & 0xFFFF_FFFFL);
                    int domainId = (int) (id >>> 32L);

                    indexClient.putWords(Context.internal(),
                            new EdgeId<>(domainId), new EdgeId<>(urlId), -5,
                            new EdgePageWordSet(new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))),
                            0
                    ).blockingSubscribe();
                }
            }
            keywords.clear();
        }
    }

    /** Loads URL -&gt; (domainId &lt;&lt; 32 | urlId) for every URL with a known title. */
    private static Map<String, Long> getUrls() {
        Map<String, Long> urls = new HashMap<>(100_000);

        try (var ds = new DatabaseModule().provideConnection();
             var conn = ds.getConnection();
             var stmt = conn.createStatement())
        {
            var rsp = stmt.executeQuery("SELECT URL, ID, DOMAIN_ID FROM EC_URL_VIEW WHERE TITLE IS NOT NULL");

            while (rsp.next()) {
                // Mask the url id so a negative int cannot sign-extend into the domain half
                long val = ((long) rsp.getInt(3) << 32) | (rsp.getInt(2) & 0xFFFF_FFFFL);

                urls.put(rsp.getString(1), val);
            }
        }
        catch (SQLException ex) {
            throw new RuntimeException(ex);
        }

        return urls;
    }
}
includeUrlPredicate; this.linkKeywordConsumer = linkKeywordConsumer; + + } @SneakyThrows @@ -70,7 +80,11 @@ public class AnchorTextExtractor { return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim(); } + Set excludedTerminators = Set.of("a", "for", "of", "in", "with", "but", "as", "by", "on", "to", "at", "-"); + private void processAnchor(EdgeUrl documentUrl, String href, String text) { + text = trimText(text); + if (!isInterestingAnchorText(text)) { return; } @@ -84,25 +98,122 @@ public class AnchorTextExtractor { return; } - for (String word: anchorTextNoise.split(text)) { - if (WordPatterns.isStopWord(word)) - continue; + if (Objects.equals(domainHash(linkUrl), domainHash(documentUrl))) { + return; + } - word = word.toLowerCase(); - if (!WordPatterns.filter(word)) { - continue; + String[] wordParts = anchorTextNoise.split(text.toLowerCase()); + + if (wordParts.length > 1) { + String word = Strings.join(Arrays.asList(wordParts), '_'); + + addKeywordIfExistsInTermFreqDictionary(linkUrl, word); + + if (word.contains(".")) { + addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word)); } - if (linkUrl.domain.equals(documentUrl.domain)) { - continue; + if (wordParts.length > 2) { + for (int i = 1; i < wordParts.length; i++) { + if (excludedTerminators.contains(wordParts[i])) continue; + if (excludedTerminators.contains(wordParts[i-1])) continue; + + word = wordParts[i-1] + "_" + wordParts[i]; + addKeywordIfExistsInTermFreqDictionary(linkUrl, word); + + if (word.contains(".")) { + addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word)); + } + } } + if (wordParts.length > 3) { + for (int i = 2; i < wordParts.length; i++) { + if (excludedTerminators.contains(wordParts[i])) continue; + if (excludedTerminators.contains(wordParts[i-2])) continue; + + word = wordParts[i-2] + "_" + wordParts[i-1] + "_" + wordParts[i]; + + addKeywordIfExistsInTermFreqDictionary(linkUrl, word); + + if (word.contains(".")) { + word = removePeriods(word); 
+ addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word)); + } + } + } + + } + + for (String word: wordParts) { + if (!WordPatterns.isStopWord(word) + && WordPatterns.filter(word) + && isNewKeywordForLink(word, linkUrl.toString()) + ) { + linkKeywordConsumer.accept(linkUrl, word); + } + } + + for (String word: wordParts) { + if (word.length() > 2 && word.endsWith("'s")) { + word = word.substring(0, word.length()-2); + } + + if (!WordPatterns.isStopWord(word) + && WordPatterns.filter(word) + && isNewKeywordForLink(word, linkUrl.toString()) + ) { + linkKeywordConsumer.accept(linkUrl, word); + } + } + } + + private void addKeywordIfExistsInTermFreqDictionary(EdgeUrl linkUrl, String word) { + if (ngramDict.getTermFreq(word) > 0) { if (isNewKeywordForLink(word, linkUrl.toString())) { linkKeywordConsumer.accept(linkUrl, word); } } } + Pattern p = Pattern.compile("\\."); + private String removePeriods(String s) { + return p.matcher(s).replaceAll(""); + } + + private String domainHash(EdgeUrl url) { + var domain = url.domain; + if ("www".equals(domain.subDomain)) { + return domain.domain; + } + return domain.toString(); + } + + private String trimText(String text) { + int start = text.length()-1; + int end = 0; + + for (int i = text.length(); i > 0; i--) { + if (Character.isLetterOrDigit(text.charAt(i-1))) { + end = i; + break; + } + } + + for (int i = 0; i < end; i++) { + if (Character.isLetterOrDigit(text.charAt(i))) { + start = i; + break; + } + } + + if (start >= 0 && start < end) { + return text.substring(start, end); + } + + return ""; + } + // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine private final Predicate looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate(); @@ -135,7 +246,7 @@ public class AnchorTextExtractor { return includeDomainPredicate.test(linkUrl.domain.toString()); } - private boolean isNewKeywordForLink(String href, String text) { + 
private synchronized boolean isNewKeywordForLink(String href, String text) { long hash = 0; hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).padToLong();