diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java
new file mode 100644
index 00000000..b68ee68c
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java
@@ -0,0 +1,140 @@
+package nu.marginalia.wmsa.edge.converting;
+
+import nu.marginalia.wmsa.configuration.module.DatabaseModule;
+import nu.marginalia.wmsa.configuration.server.Context;
+import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.model.EdgeId;
+import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
+import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.*;
+import java.util.function.Consumer;
+
+/**
+ * Loads link keywords from a tab-separated (url, keyword) file given as the
+ * first argument and sends them to the index as {@link IndexBlock#Link} words.
+ * Keywords are grouped by runs of consecutive lines that share the same URL.
+ */
+public class LinkKeywordLoaderMain {
+
+    public static void main(String... args) {
+        if (args.length < 1) {
+            throw new IllegalArgumentException("Usage: LinkKeywordLoaderMain <url-keyword-file>");
+        }
+
+        Map<String, Long> urlToId = getUrls();
+
+        try (EdgeIndexClient indexClient = new EdgeIndexClient();
+             var lines = Files.lines(Path.of(args[0]))
+        ) {
+            Uploader uploader = new Uploader(urlToId, indexClient);
+
+            lines
+                    .map(UrlKeyword::parseLine)
+                    .filter(Objects::nonNull)
+                    .forEach(uploader);
+
+            // accept() only uploads a URL's keywords once the next URL shows up,
+            // so the final URL has to be flushed explicitly
+            uploader.flush();
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** One (url, keyword) row of the input file. */
+    private record UrlKeyword(String url, String keyword) {
+        /** Parses a line of exactly two tab-separated fields; returns null for any other line. */
+        public static UrlKeyword parseLine(String line) {
+            String[] parts = line.split("\t");
+            if (parts.length == 2) {
+                return new UrlKeyword(parts[0], parts[1]);
+            }
+            return null;
+        }
+    }
+
+    /**
+     * Collects the keywords of consecutive rows with the same URL and uploads
+     * them in one call per URL. Not thread-safe.
+     */
+    private static class Uploader implements Consumer<UrlKeyword> {
+        private final Map<String, Long> urlToId;
+        private final EdgeIndexClient indexClient;
+
+        private String lastLine = null;
+        private final Set<String> keywords = new HashSet<>(100);
+
+        private Uploader(Map<String, Long> urlToId,
+                         EdgeIndexClient indexClient) {
+            this.urlToId = urlToId;
+            this.indexClient = indexClient;
+        }
+
+        @Override
+        public void accept(UrlKeyword urlKeyword) {
+            if (urlKeyword == null) return;
+
+            if (lastLine != null && !urlKeyword.url.equals(lastLine)) {
+                flush();
+            }
+
+            lastLine = urlKeyword.url;
+            keywords.add(urlKeyword.keyword);
+        }
+
+        /** Uploads the keywords buffered for the current URL, if any, and empties the buffer. */
+        void flush() {
+            if (lastLine == null || keywords.isEmpty()) return;
+
+            Long id = urlToId.get(lastLine);
+
+            // URLs without an entry in urlToId are skipped
+            if (id != null) {
+                int urlId = (int)(id & 0xFFFF_FFFFL);
+                int domainId = (int)(id >>> 32L);
+
+                indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, new EdgePageWordSet(
+                        new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))), 0
+                ).blockingSubscribe();
+            }
+
+            keywords.clear();
+        }
+    }
+
+    /**
+     * Reads every row of EC_URL_VIEW whose TITLE is not null and maps its URL to a long
+     * that holds DOMAIN_ID in the upper 32 bits and ID in the lower 32 bits.
+     */
+    private static Map<String, Long> getUrls() {
+
+        Map<String, Long> urls = new HashMap<>(100_000);
+
+        try (var ds = new DatabaseModule().provideConnection();
+             var conn = ds.getConnection();
+             var stmt = conn.createStatement())
+        {
+            var rsp = stmt.executeQuery("SELECT URL, ID, DOMAIN_ID FROM EC_URL_VIEW WHERE TITLE IS NOT NULL");
+
+            while (rsp.next()) {
+                long val = rsp.getInt(3);
+                // Without the unsigned conversion a negative ID would sign-extend into the domain bits
+                val = (val << 32L) | Integer.toUnsignedLong(rsp.getInt(2));
+
+                urls.put(rsp.getString(1), val);
+            }
+
+        }
+        catch (SQLException ex) {
+            throw new RuntimeException(ex);
+        }
+
+        return urls;
+    }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java
index 8c5fc6c1..6d4927fb 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java
@@ -5,13 +5,19 @@ import com.google.common.hash.Hashing;
 import lombok.SneakyThrows;
 import nu.marginalia.util.DenseBitMap;
 import nu.marginalia.util.language.WordPatterns;
+import nu.marginalia.wmsa.configuration.WmsaHome;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import org.apache.logging.log4j.util.Strings;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 
 import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Objects;
+import java.util.Set;
 import java.util.function.BiConsumer;
 import java.util.function.Predicate;
 import java.util.regex.Pattern;
@@ -30,12 +36,16 @@ public class AnchorTextExtractor {
     // de-duplicating billions of shuffled (url, word) tuples on limited hardware
     private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
 
+    private final NGramDict ngramDict = new NGramDict(WmsaHome.getLanguageModels());
+
     public AnchorTextExtractor(Predicate includeDomainPredicate,
                                Predicate includeUrlPredicate,
                                BiConsumer linkKeywordConsumer) {
         this.includeDomainPredicate = includeDomainPredicate;
         this.includeUrlPredicate = includeUrlPredicate;
         this.linkKeywordConsumer = linkKeywordConsumer;
+
+
     }
 
     @SneakyThrows
@@ -70,7 +80,12 @@ public class AnchorTextExtractor {
         return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
     }
 
+    // Words that are not allowed as the first or last word of a two- or three-word keyword
+    private static final Set<String> excludedTerminators = Set.of("a", "for", "of", "in", "with", "but", "as", "by", "on", "to", "at", "-");
+
     private void processAnchor(EdgeUrl documentUrl, String href, String text) {
+        text = trimText(text);
+
         if (!isInterestingAnchorText(text)) {
             return;
         }
@@ -84,25 +99,136 @@ public class AnchorTextExtractor {
             return;
         }
 
-        for (String word: anchorTextNoise.split(text)) {
-            if (WordPatterns.isStopWord(word))
-                continue;
+        if (Objects.equals(domainHash(linkUrl), domainHash(documentUrl))) {
+            return;
+        }
 
-            word = word.toLowerCase();
-            if (!WordPatterns.filter(word)) {
-                continue;
+        String[] wordParts = anchorTextNoise.split(text.toLowerCase());
+
+        // Multi-word anchor text: try the whole phrase, then two- and three-word windows, joined with '_'
+        if (wordParts.length > 1) {
+            String word = Strings.join(Arrays.asList(wordParts), '_');
+
+            addKeywordIfExistsInTermFreqDictionary(linkUrl, word);
+
+            if (word.contains(".")) {
+                addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word));
             }
 
-            if (linkUrl.domain.equals(documentUrl.domain)) {
-                continue;
+            if (wordParts.length > 2) {
+                for (int i = 1; i < wordParts.length; i++) {
+                    if (excludedTerminators.contains(wordParts[i])) continue;
+                    if (excludedTerminators.contains(wordParts[i-1])) continue;
+
+                    word = wordParts[i-1] + "_" + wordParts[i];
+                    addKeywordIfExistsInTermFreqDictionary(linkUrl, word);
+
+                    if (word.contains(".")) {
+                        addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word));
+                    }
+                }
             }
 
+            if (wordParts.length > 3) {
+                for (int i = 2; i < wordParts.length; i++) {
+                    if (excludedTerminators.contains(wordParts[i])) continue;
+                    if (excludedTerminators.contains(wordParts[i-2])) continue;
+
+                    word = wordParts[i-2] + "_" + wordParts[i-1] + "_" + wordParts[i];
+
+                    addKeywordIfExistsInTermFreqDictionary(linkUrl, word);
+
+                    if (word.contains(".")) {
+                        addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word));
+                    }
+                }
+            }
+
+        }
+
+        for (String word: wordParts) {
+            if (!WordPatterns.isStopWord(word)
+                && WordPatterns.filter(word)
+                && isNewKeywordForLink(word, linkUrl.toString())
+            ) {
+                linkKeywordConsumer.accept(linkUrl, word);
+            }
+        }
+
+        for (String word: wordParts) {
+            if (word.length() > 2 && word.endsWith("'s")) {
+                word = word.substring(0, word.length()-2);
+            }
+
+            if (!WordPatterns.isStopWord(word)
+                && WordPatterns.filter(word)
+                && isNewKeywordForLink(word, linkUrl.toString())
+            ) {
+                linkKeywordConsumer.accept(linkUrl, word);
+            }
+        }
+    }
+
+    /**
+     * Passes the keyword on to the consumer if the term frequency dictionary
+     * reports a positive frequency for it and isNewKeywordForLink accepts it.
+     */
+    private void addKeywordIfExistsInTermFreqDictionary(EdgeUrl linkUrl, String word) {
+        if (ngramDict.getTermFreq(word) > 0) {
             if (isNewKeywordForLink(word, linkUrl.toString())) {
                 linkKeywordConsumer.accept(linkUrl, word);
             }
         }
     }
 
+    private static final Pattern periodPattern = Pattern.compile("\\.");
+
+    /** Returns the string with every period removed. */
+    private String removePeriods(String s) {
+        return periodPattern.matcher(s).replaceAll("");
+    }
+
+    /**
+     * Key compared between the link URL and the document URL: domain.domain when
+     * the subdomain is "www", otherwise the string form of the whole domain.
+     */
+    private String domainHash(EdgeUrl url) {
+        var domain = url.domain;
+        if ("www".equals(domain.subDomain)) {
+            return domain.domain;
+        }
+        return domain.toString();
+    }
+
+    /**
+     * Cuts off leading and trailing characters that are not letters or digits.
+     * Returns the empty string when the text has no letter or digit at all.
+     */
+    private String trimText(String text) {
+        int start = text.length()-1;
+        int end = 0;
+
+        for (int i = text.length(); i > 0; i--) {
+            if (Character.isLetterOrDigit(text.charAt(i-1))) {
+                end = i;
+                break;
+            }
+        }
+
+        for (int i = 0; i < end; i++) {
+            if (Character.isLetterOrDigit(text.charAt(i))) {
+                start = i;
+                break;
+            }
+        }
+
+        if (start >= 0 && start < end) {
+            return text.substring(start, end);
+        }
+
+        return "";
+    }
+
     // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
     private final Predicate looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();
 
@@ -135,7 +261,7 @@ public class AnchorTextExtractor {
         return includeDomainPredicate.test(linkUrl.domain.toString());
     }
 
-    private boolean isNewKeywordForLink(String href, String text) {
+    private synchronized boolean isNewKeywordForLink(String href, String text) {
         long hash = 0;
 
         hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).padToLong();
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
index a7e900b4..6bc50632 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
@@ -19,7 +19,6 @@ import org.slf4j.LoggerFactory;
 
 import java.sql.SQLException;
 import java.util.*;
-import java.util.stream.Collectors;
 
 public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
 
@@ -266,18 +265,26 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
     }
 
     @Override
-    public List getBrowseResultFromUrlIds(List> urlId) {
-        if (urlId.isEmpty())
+    public List getBrowseResultFromUrlIds(List> urlIds) {
+        if (urlIds.isEmpty())
             return Collections.emptyList();
 
-        List ret = new ArrayList<>(urlId.size());
+        List ret = new ArrayList<>(urlIds.size());
 
         try (var conn = dataSource.getConnection()) {
             try (var stmt = conn.createStatement()) {
 
-                // this is safe, string concatenation is of integers
-                String inStmt = urlId.stream().map(id -> Integer.toString(id.id())).collect(Collectors.joining(", ", "(", ")"));
-                var rsp = stmt.executeQuery("SELECT DOMAIN_ID, DOMAIN_NAME FROM EC_URL_VIEW INNER JOIN DOMAIN_METADATA ON EC_URL_VIEW.DOMAIN_ID=DOMAIN_METADATA.ID WHERE KNOWN_URLS<5000 AND QUALITY>-10 AND EC_URL_VIEW.ID IN " + inStmt);
+                String inStmt = idList(urlIds);
+
+                var rsp = stmt.executeQuery("""
+                    SELECT DOMAIN_ID, DOMAIN_NAME
+                    FROM EC_URL_VIEW
+                    INNER JOIN DOMAIN_METADATA ON EC_URL_VIEW.DOMAIN_ID=DOMAIN_METADATA.ID
+                    WHERE
+                    KNOWN_URLS<5000
+                    AND QUALITY>-10
+                    AND EC_URL_VIEW.ID IN
+                    """ + inStmt); // this injection is safe, inStmt is derived from concatenating a list of integers
                 while (rsp.next()) {
                     int id = rsp.getInt(1);
                     String domain = rsp.getString(2);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java
index e4061982..74bc8e18 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java
@@ -57,18 +57,15 @@ public class SearchIndexReader implements AutoCloseable {
         queryBuilders = new EnumMap<>(IndexBlock.class);
         underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);
 
-        queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex, wordsIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.Middle, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex), wordsIndex));
+        queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex, namesIndex, wordsIndex), wordsIndex));
+        queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex), wordsIndex));
         queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.PositionWords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex, positionIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.NamesWords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, linkIndex), wordsIndex));
         queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex));
         queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex));
 
         underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
-        underspecifiedQueryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(listOfNonNulls(linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
+        underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
+        underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
     }
 
     @SafeVarargs
@@ -121,7 +118,13 @@ public class SearchIndexReader implements AutoCloseable {
     }
 
     public Query findWord(IndexBlock block, IndexSearchBudget budget, LongPredicate filter, int wordId) {
-        return queryBuilders.get(block).build(budget, filter, wordId);
+        var builder = queryBuilders.get(block);
+
+        // Blocks without a registered query builder match nothing
+        if (builder == null)
+            return Query.EMPTY;
+
+        return builder.build(budget, filter, wordId);
     }
 
     @Override
@@ -135,13 +138,21 @@ public class SearchIndexReader implements AutoCloseable {
 
     @SneakyThrows
     public long numHits(IndexBlock block, int word) {
-        return numHitsCache.get(Pair.of(block, word),
-                () -> queryBuilders.get(block)
-                        .getIndicies()
-                        .stream()
-                        .mapToLong(idx -> idx.numUrls(word))
-                        .sum()
-        );
+        return numHitsCache.get(Pair.of(block, word), () -> numHitsForBlockWord(block, word));
+    }
+
+    /** Sums numUrls(word) over the indices of the block's query builder; 0 if the block has no builder. */
+    private long numHitsForBlockWord(IndexBlock block, int word) {
+        IndexQueryBuilder builder = queryBuilders.get(block);
+
+        if (builder == null)
+            return 0L;
+
+        return builder
+                .getIndicies()
+                .stream()
+                .mapToLong(idx -> idx.numUrls(word))
+                .sum();
     }
 
     public IndexBlock getBlockForResult(int searchTerm, long urlId) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java
index 5f343d54..ac941b83 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java
@@ -3,6 +3,18 @@ package nu.marginalia.wmsa.edge.index.reader.query;
 import java.util.stream.LongStream;
 
 public interface Query {
+    /** A query that matches nothing: also() and not() return it unchanged and stream() is empty. */
+    Query EMPTY = new Query() {
+        @Override
+        public Query also(int wordId) { return this; }
+
+        @Override
+        public Query not(int wordId) { return this; }
+
+        @Override
+        public LongStream stream() { return LongStream.empty(); }
+    };
+
     Query also(int wordId);
     Query not(int wordId);
 