Merge pull request 'Preparations for anchor tag inclusion' (#70) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/70
2025-02-23 13:09:00 +00:00 · 2022-08-01 20:30:08 +02:00 · 2022-08-01 20:30:08 +02:00 · 914badd777
commit 914badd777
parent 76707b8cb0 9f55ad3f34
5 changed files with 284 additions and 31 deletions
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java
@ -0,0 +1,115 @@
+package nu.marginalia.wmsa.edge.converting;
+
+import nu.marginalia.wmsa.configuration.module.DatabaseModule;
+import nu.marginalia.wmsa.configuration.server.Context;
+import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.model.EdgeId;
+import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
+import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.*;
+import java.util.function.Consumer;
+
+/**
+ * Command-line loader that reads tab-separated {@code url<TAB>keyword} lines from the file
+ * named by the first argument and uploads the keywords of each known URL to the index as
+ * {@link IndexBlock#Link} words.
+ *
+ * <p>Keywords are grouped by runs of consecutive lines sharing the same URL, so the input
+ * has to be grouped (e.g. sorted) by URL; a URL that reappears later in the file is
+ * uploaded as a separate batch.
+ */
+public class LinkKeywordLoaderMain {
+
+    /**
+     * Loads the URL-to-id table from the database, then streams the keyword file into the index.
+     *
+     * @param args {@code args[0]} is the path of the url/keyword file
+     * @throws IllegalArgumentException if no file argument is given
+     * @throws RuntimeException wrapping any {@link IOException} or {@link SQLException}
+     */
+    public static void main(String... args) {
+        // Validate the arguments before the (potentially large) database load
+        if (args.length < 1) {
+            throw new IllegalArgumentException("Usage: LinkKeywordLoaderMain <url-keyword-file>");
+        }
+
+        Map<String, Long> urlToId = getUrls();
+        try (EdgeIndexClient indexClient = new EdgeIndexClient();
+             var lines = Files.lines(Path.of(args[0]))
+        ) {
+            Uploader uploader = new Uploader(urlToId, indexClient);
+
+            // The uploader is stateful and depends on line order, hence forEachOrdered
+            lines
+                    .map(UrlKeyword::parseLine)
+                    .filter(Objects::nonNull)
+                    .forEachOrdered(uploader);
+
+            // A group is only uploaded when the next URL shows up, so the final
+            // group must be pushed explicitly once the input is exhausted.
+            uploader.flush();
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** One parsed input line: a URL and a single keyword associated with it. */
+    private record UrlKeyword(String url, String keyword) {
+        /**
+         * Parses a {@code url<TAB>keyword} line.
+         *
+         * @return the parsed pair, or {@code null} if the line does not split into exactly two parts
+         */
+        public static UrlKeyword parseLine(String line) {
+            String[] parts = line.split("\t");
+            if (parts.length == 2) {
+                return new UrlKeyword(parts[0], parts[1]);
+            }
+            return null;
+        }
+    }
+
+    /**
+     * Collects the keywords of consecutive lines that share a URL and uploads each group
+     * with a single {@code putWords} call. Holds unsynchronized mutable state, so it must
+     * be fed from one thread, in input order, and {@link #flush()} must be called after
+     * the last line.
+     */
+    private static class Uploader implements Consumer<UrlKeyword> {
+        private final Map<String, Long> urlToId;
+        private final EdgeIndexClient indexClient;
+
+        // URL of the group currently being collected; null when no group is open
+        private String lastUrl = null;
+        private final Set<String> keywords = new HashSet<>(100);
+
+        private Uploader(Map<String, Long> urlToId,
+                         EdgeIndexClient indexClient) {
+            this.urlToId = urlToId;
+            this.indexClient = indexClient;
+        }
+
+        @Override
+        public void accept(UrlKeyword urlKeyword) {
+            if (urlKeyword == null) return;
+
+            // A change of URL closes the previous group
+            if (lastUrl != null && !urlKeyword.url.equals(lastUrl)) {
+                flush();
+            }
+
+            lastUrl = urlKeyword.url;
+            keywords.add(urlKeyword.keyword);
+        }
+
+        /**
+         * Uploads the group collected so far (if its URL is present in the id table) and
+         * resets the collector. Does nothing when no group is open.
+         */
+        void flush() {
+            if (lastUrl == null) return;
+
+            Long id = urlToId.get(lastUrl);
+
+            if (id != null) {
+                // Unpack the (domainId << 32 | urlId) value built by getUrls()
+                int urlId = (int)(id & 0xFFFF_FFFFL);
+                int domainId = (int)(id >>> 32L);
+
+                // NOTE(review): the literal arguments -5 and 0 are carried over unchanged;
+                // their meaning is defined by EdgeIndexClient.putWords.
+                indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, new EdgePageWordSet(
+                        new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))), 0
+                ).blockingSubscribe();
+            }
+
+            lastUrl = null;
+            keywords.clear();
+        }
+    }
+
+    /**
+     * Reads every row of EC_URL_VIEW that has a title and maps its URL string to a packed
+     * id: DOMAIN_ID in the upper 32 bits, the URL's ID in the lower 32 bits.
+     *
+     * @throws RuntimeException wrapping any {@link SQLException}
+     */
+    private static Map<String, Long> getUrls() {
+
+        Map<String, Long> urls = new HashMap<>(100_000);
+
+        try (var ds = new DatabaseModule().provideConnection();
+             var conn = ds.getConnection();
+             var stmt = conn.createStatement();
+             var rsp = stmt.executeQuery("SELECT URL, ID, DOMAIN_ID FROM EC_URL_VIEW WHERE TITLE IS NOT NULL"))
+        {
+            while (rsp.next()) {
+                long domainId = rsp.getInt(3);
+                // Zero-extend rather than sign-extend, so a negative int cannot overwrite the domain bits
+                long urlId = Integer.toUnsignedLong(rsp.getInt(2));
+
+                urls.put(rsp.getString(1), (domainId << 32) | urlId);
+            }
+
+        }
+        catch (SQLException ex) {
+            throw new RuntimeException(ex);
+        }
+
+        return urls;
+    }
+}
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java
@ -5,13 +5,19 @@ import com.google.common.hash.Hashing;
 import lombok.SneakyThrows;
 import nu.marginalia.util.DenseBitMap;
 import nu.marginalia.util.language.WordPatterns;
+import nu.marginalia.wmsa.configuration.WmsaHome;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import org.apache.logging.log4j.util.Strings;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;

 import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Objects;
+import java.util.Set;
 import java.util.function.BiConsumer;
 import java.util.function.Predicate;
 import java.util.regex.Pattern;
@ -30,12 +36,16 @@ public class AnchorTextExtractor {
    // de-duplicating billions of shuffled (url, word) tuples on limited hardware
    private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);

+    private final NGramDict ngramDict = new NGramDict(WmsaHome.getLanguageModels());
+
+    /**
+     * Stores the collaborators used while processing anchors.
+     *
+     * @param includeDomainPredicate tested against the string form of a link URL's domain
+     * @param includeUrlPredicate    predicate over link URLs (NOTE(review): its call site is
+     *                               not shown in this change — confirm how it is applied)
+     * @param linkKeywordConsumer    receives each (link URL, keyword) pair that is emitted
+     */
    public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
                               Predicate<EdgeUrl> includeUrlPredicate,
                               BiConsumer<EdgeUrl, String> linkKeywordConsumer) {
        this.includeDomainPredicate = includeDomainPredicate;
        this.includeUrlPredicate = includeUrlPredicate;
        this.linkKeywordConsumer = linkKeywordConsumer;
+
+
    }

    @SneakyThrows
@ -70,7 +80,11 @@ public class AnchorTextExtractor {
        return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
    }

+    // Words that a two- or three-word keyword built in processAnchor may not begin or end with
+    Set<String> excludedTerminators = Set.of("a", "for", "of", "in", "with", "but", "as", "by", "on", "to", "at", "-");
+
    private void processAnchor(EdgeUrl documentUrl, String href, String text) {
+        text = trimText(text);
+
        if (!isInterestingAnchorText(text)) {
            return;
        }
@ -84,25 +98,122 @@ public class AnchorTextExtractor {
            return;
        }

-        for (String word: anchorTextNoise.split(text)) {
-            if (WordPatterns.isStopWord(word))
-                continue;
+        if (Objects.equals(domainHash(linkUrl), domainHash(documentUrl))) {
+            return;
+        }

-            word = word.toLowerCase();
-            if (!WordPatterns.filter(word)) {
-                continue;
+        String[] wordParts = anchorTextNoise.split(text.toLowerCase());
+
+        if (wordParts.length > 1) {
+            String word = Strings.join(Arrays.asList(wordParts), '_');
+
+            addKeywordIfExistsInTermFreqDictionary(linkUrl, word);
+
+            if (word.contains(".")) {
+                addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word));
            }

-            if (linkUrl.domain.equals(documentUrl.domain)) {
-                continue;
+            if (wordParts.length > 2) {
+                for (int i = 1; i < wordParts.length; i++) {
+                    if (excludedTerminators.contains(wordParts[i])) continue;
+                    if (excludedTerminators.contains(wordParts[i-1])) continue;
+
+                    word = wordParts[i-1] + "_" + wordParts[i];
+                    addKeywordIfExistsInTermFreqDictionary(linkUrl, word);
+
+                    if (word.contains(".")) {
+                        addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word));
+                    }
+                }
            }

+            if (wordParts.length > 3) {
+                for (int i = 2; i < wordParts.length; i++) {
+                    if (excludedTerminators.contains(wordParts[i])) continue;
+                    if (excludedTerminators.contains(wordParts[i-2])) continue;
+
+                    word = wordParts[i-2] + "_" + wordParts[i-1] + "_" + wordParts[i];
+
+                    addKeywordIfExistsInTermFreqDictionary(linkUrl, word);
+
+                    if (word.contains(".")) {
+                        word = removePeriods(word);
+                        addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word));
+                    }
+                }
+            }
+
+        }
+
+        for (String word: wordParts) {
+            if (!WordPatterns.isStopWord(word)
+                && WordPatterns.filter(word)
+                && isNewKeywordForLink(word, linkUrl.toString())
+            ) {
+                linkKeywordConsumer.accept(linkUrl, word);
+            }
+        }
+
+        for (String word: wordParts) {
+            if (word.length() > 2 && word.endsWith("'s")) {
+                word = word.substring(0, word.length()-2);
+            }
+
+            if (!WordPatterns.isStopWord(word)
+                    && WordPatterns.filter(word)
+                    && isNewKeywordForLink(word, linkUrl.toString())
+            ) {
+                linkKeywordConsumer.accept(linkUrl, word);
+            }
+        }
+    }
+
+    /**
+     * Emits {@code word} as a keyword for {@code linkUrl}, but only when the n-gram
+     * dictionary reports a term frequency above zero for it and the (word, url) pair
+     * passes {@code isNewKeywordForLink}.
+     */
+    private void addKeywordIfExistsInTermFreqDictionary(EdgeUrl linkUrl, String word) {
+        if (ngramDict.getTermFreq(word) > 0) {
            if (isNewKeywordForLink(word, linkUrl.toString())) {
                linkKeywordConsumer.accept(linkUrl, word);
            }
        }
    }

+    Pattern p = Pattern.compile("\\.");
+    private String removePeriods(String s) {
+        return p.matcher(s).replaceAll("");
+    }
+
+    private String domainHash(EdgeUrl url) {
+        var domain = url.domain;
+        if ("www".equals(domain.subDomain)) {
+            return domain.domain;
+        }
+        return domain.toString();
+    }
+
+    /**
+     * Strips every leading and trailing character that is not a letter or digit.
+     *
+     * @return the trimmed text, or the empty string when {@code text} contains no letter or digit
+     */
+    private String trimText(String text) {
+        // Walk back from the end to just past the last letter or digit
+        int endExclusive = text.length();
+        while (endExclusive > 0 && !Character.isLetterOrDigit(text.charAt(endExclusive - 1))) {
+            endExclusive--;
+        }
+
+        // Walk forward to the first letter or digit
+        int begin = 0;
+        while (begin < endExclusive && !Character.isLetterOrDigit(text.charAt(begin))) {
+            begin++;
+        }
+
+        return begin < endExclusive ? text.substring(begin, endExclusive) : "";
+    }
+
    // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
    private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();

@ -135,7 +246,7 @@ public class AnchorTextExtractor {
        return includeDomainPredicate.test(linkUrl.domain.toString());
    }

-    private boolean isNewKeywordForLink(String href, String text) {
+    private synchronized boolean isNewKeywordForLink(String href, String text) {
        long hash = 0;

        hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).padToLong();
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
@ -19,7 +19,6 @@ import org.slf4j.LoggerFactory;

 import java.sql.SQLException;
 import java.util.*;
-import java.util.stream.Collectors;


 public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
@ -266,18 +265,26 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
    }

    @Override
-    public List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlId) {
-        if (urlId.isEmpty())
+    public List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlIds) {
+        if (urlIds.isEmpty())
            return Collections.emptyList();

-        List<BrowseResult> ret = new ArrayList<>(urlId.size());
+        List<BrowseResult> ret = new ArrayList<>(urlIds.size());

        try (var conn = dataSource.getConnection()) {
            try (var stmt = conn.createStatement()) {
-                // this is safe, string concatenation is of integers
-                String inStmt = urlId.stream().map(id -> Integer.toString(id.id())).collect(Collectors.joining(", ", "(", ")"));

-                var rsp = stmt.executeQuery("SELECT DOMAIN_ID, DOMAIN_NAME FROM EC_URL_VIEW INNER JOIN DOMAIN_METADATA ON EC_URL_VIEW.DOMAIN_ID=DOMAIN_METADATA.ID WHERE KNOWN_URLS<5000 AND QUALITY>-10 AND EC_URL_VIEW.ID IN " + inStmt);
+                String inStmt = idList(urlIds);
+
+                var rsp = stmt.executeQuery("""
+                    SELECT DOMAIN_ID, DOMAIN_NAME
+                    FROM EC_URL_VIEW 
+                    INNER JOIN DOMAIN_METADATA ON EC_URL_VIEW.DOMAIN_ID=DOMAIN_METADATA.ID 
+                    WHERE 
+                        KNOWN_URLS<5000 
+                    AND QUALITY>-10 
+                    AND EC_URL_VIEW.ID IN 
+                    """ + inStmt); // this injection is safe, inStmt is derived from concatenating a list of integers
                while (rsp.next()) {
                    int id = rsp.getInt(1);
                    String domain = rsp.getString(2);
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java
@ -57,18 +57,15 @@ public class SearchIndexReader implements AutoCloseable {
        queryBuilders = new EnumMap<>(IndexBlock.class);
        underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);

-        queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex, wordsIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.Middle, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex), wordsIndex));
+        queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex, namesIndex, wordsIndex), wordsIndex));
+        queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex), wordsIndex));
        queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.PositionWords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex, positionIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.NamesWords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, linkIndex), wordsIndex));
        queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex));
        queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex));

        underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
-        underspecifiedQueryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(listOfNonNulls(linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
+        // topicIndex was listed twice here; each index appears once, first-occurrence order kept
+        underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
+        underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
    }

    @SafeVarargs
@ -121,7 +118,12 @@ public class SearchIndexReader implements AutoCloseable {
    }

+    /**
+     * Builds a query for {@code wordId} using the query builder registered for {@code block}.
+     *
+     * @return the built query, or {@code Query.EMPTY} when no builder is registered for the block
+     */
    public Query findWord(IndexBlock block, IndexSearchBudget budget, LongPredicate filter, int wordId) {
-        return queryBuilders.get(block).build(budget, filter, wordId);
+        var builder = queryBuilders.get(block);
+
+        if (builder == null)
+            return Query.EMPTY;
+
+        return builder.build(budget, filter, wordId);
    }

    @Override
@ -135,13 +137,20 @@ public class SearchIndexReader implements AutoCloseable {

+    /**
+     * Returns the hit count for {@code word} in {@code block}, looked up in
+     * {@code numHitsCache} under the (block, word) pair with
+     * {@code numHitsForBlockWord} supplied as the value loader.
+     */
    @SneakyThrows
    public long numHits(IndexBlock block, int word) {
-        return numHitsCache.get(Pair.of(block, word),
-                () -> queryBuilders.get(block)
-                        .getIndicies()
-                        .stream()
-                        .mapToLong(idx -> idx.numUrls(word))
-                        .sum()
-        );
+        return numHitsCache.get(Pair.of(block, word), () -> numHitsForBlockWord(block, word));
+    }
+
+    private long numHitsForBlockWord(IndexBlock block, int word) {
+        IndexQueryBuilder builder = queryBuilders.get(block);
+
+        if (builder == null)
+            return 0L;
+
+        return builder
+                .getIndicies()
+                .stream()
+                .mapToLong(idx -> idx.numUrls(word))
+                .sum();
    }

    public IndexBlock getBlockForResult(int searchTerm, long urlId) {
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java
@ -3,6 +3,17 @@ package nu.marginalia.wmsa.edge.index.reader.query;
 import java.util.stream.LongStream;

 public interface Query {
+    Query EMPTY = new Query() {
+        @Override
+        public Query also(int wordId) { return this; }
+
+        @Override
+        public Query not(int wordId) { return this; }
+
+        @Override
+        public LongStream stream() { return LongStream.empty(); }
+    };
+
    Query also(int wordId);
    Query not(int wordId);