A-tags loader

vlofgren 2022-08-01 20:05:55 +02:00
parent 9a6c8339d0
commit 9a4183a481
2 changed files with 235 additions and 9 deletions

View File: nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java (new file)

@ -0,0 +1,115 @@
package nu.marginalia.wmsa.edge.converting;

import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.*;
import java.util.function.Consumer;
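/** Bulk-loads anchor text keywords into the index: reads a tab-separated file of
 *  (url, keyword) pairs, resolves each URL to its domain and url ids via the
 *  database, and uploads the keywords as Link-block words. */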
public class LinkKeywordLoaderMain {

    public static void main(String... args) {
        Map<String, Long> urlToId = getUrls();

        try (EdgeIndexClient indexClient = new EdgeIndexClient();
             var lines = Files.lines(Path.of(args[0]))
        ) {
            lines
                .map(UrlKeyword::parseLine)
                .filter(Objects::nonNull)
                .forEach(new Uploader(urlToId, indexClient));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
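    // One line of the input file: URL and keyword separated by a tab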
    private record UrlKeyword(String url, String keyword) {
        public static UrlKeyword parseLine(String line) {
            String[] parts = line.split("\t");
            if (parts.length == 2) {
                return new UrlKeyword(parts[0], parts[1]);
            }
            return null;
        }
    }
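    // Groups consecutive lines that share the same URL and uploads each group's
    // keywords in a single putWords call; this relies on the input file being
    // sorted (or at least grouped) by URL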
    private static class Uploader implements Consumer<UrlKeyword> {
        private final Map<String, Long> urlToId;
        private final EdgeIndexClient indexClient;

        private Uploader(Map<String, Long> urlToId,
                         EdgeIndexClient indexClient) {
            this.urlToId = urlToId;
            this.indexClient = indexClient;
        }

        String lastLine = null;
        Set<String> keywords = new HashSet<>(100);

        @Override
        public void accept(UrlKeyword urlKeyword) {
            if (urlKeyword == null) return;

            if (lastLine == null) {
                lastLine = urlKeyword.url;
                keywords.add(urlKeyword.keyword);
            }
            else if (urlKeyword.url.equals(lastLine)) {
                keywords.add(urlKeyword.keyword);
            }
            else {
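                // getUrls() packs the domain id into the high 32 bits of the value
                // and the url id into the low 32 bits; unpack both halves here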
                Long id = urlToId.get(lastLine);
                if (id != null) {
                    int urlId = (int) (id & 0xFFFF_FFFFL);
                    int domainId = (int) (id >>> 32L);

                    // System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords);

                    indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5,
                            new EdgePageWordSet(new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))), 0
                    ).blockingSubscribe();
                }

                lastLine = urlKeyword.url;
                keywords.clear();
                keywords.add(urlKeyword.keyword);
            }
        }
    }
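    // Loads all known URLs into memory, packing domain id (high 32 bits) and
    // url id (low 32 bits) into a single long per URL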
    private static Map<String, Long> getUrls() {
        Map<String, Long> urls = new HashMap<>(100_000);

        try (var ds = new DatabaseModule().provideConnection();
             var conn = ds.getConnection();
             var stmt = conn.createStatement())
        {
            var rsp = stmt.executeQuery("SELECT URL, ID, DOMAIN_ID FROM EC_URL_VIEW WHERE TITLE IS NOT NULL");
            while (rsp.next()) {
                long val = rsp.getInt(3);
                val = (val << 32L) | rsp.getInt(2);

                urls.put(rsp.getString(1), val);
            }
        }
        catch (SQLException ex) {
            throw new RuntimeException(ex);
        }

        return urls;
    }
}

View File: AnchorTextExtractor.java

@ -5,13 +5,19 @@ import com.google.common.hash.Hashing;
import lombok.SneakyThrows;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.apache.logging.log4j.util.Strings;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Objects;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.Predicate;
import java.util.regex.Pattern;
@ -30,12 +36,16 @@ public class AnchorTextExtractor {
    // Used for de-duplicating billions of shuffled (url, word) tuples on limited hardware
    private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
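    // Term frequency dictionary; used to check that a generated n-gram is a
    // known term before it is emitted as a keyword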
    private final NGramDict ngramDict = new NGramDict(WmsaHome.getLanguageModels());
    public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
                               Predicate<EdgeUrl> includeUrlPredicate,
                               BiConsumer<EdgeUrl, String> linkKeywordConsumer) {
        this.includeDomainPredicate = includeDomainPredicate;
        this.includeUrlPredicate = includeUrlPredicate;
        this.linkKeywordConsumer = linkKeywordConsumer;
    }
@SneakyThrows
@ -70,7 +80,11 @@ public class AnchorTextExtractor {
        return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
    }

    Set<String> excludedTerminators = Set.of("a", "for", "of", "in", "with", "but", "as", "by", "on", "to", "at", "-");
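    // Besides single words, anchor text is also indexed as underscore-joined
    // bi- and trigrams; excludedTerminators lists function words that may not
    // begin or end such an n-gram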
    private void processAnchor(EdgeUrl documentUrl, String href, String text) {
        text = trimText(text);

        if (!isInterestingAnchorText(text)) {
            return;
        }
@ -84,25 +98,122 @@ public class AnchorTextExtractor {
            return;
        }

-        for (String word: anchorTextNoise.split(text)) {
-            if (WordPatterns.isStopWord(word))
-                continue;
        if (Objects.equals(domainHash(linkUrl), domainHash(documentUrl))) {
            return;
        }

-            word = word.toLowerCase();
-            if (!WordPatterns.filter(word)) {
-                continue;
        String[] wordParts = anchorTextNoise.split(text.toLowerCase());

        if (wordParts.length > 1) {
            String word = Strings.join(Arrays.asList(wordParts), '_');

            addKeywordIfExistsInTermFreqDictionary(linkUrl, word);
            if (word.contains(".")) {
                addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word));
            }

-            if (linkUrl.domain.equals(documentUrl.domain)) {
-                continue;
            if (wordParts.length > 2) {
                for (int i = 1; i < wordParts.length; i++) {
                    if (excludedTerminators.contains(wordParts[i])) continue;
                    if (excludedTerminators.contains(wordParts[i-1])) continue;

                    word = wordParts[i-1] + "_" + wordParts[i];

                    addKeywordIfExistsInTermFreqDictionary(linkUrl, word);
                    if (word.contains(".")) {
                        addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word));
                    }
                }
            }

            if (wordParts.length > 3) {
                for (int i = 2; i < wordParts.length; i++) {
                    if (excludedTerminators.contains(wordParts[i])) continue;
                    if (excludedTerminators.contains(wordParts[i-2])) continue;

                    word = wordParts[i-2] + "_" + wordParts[i-1] + "_" + wordParts[i];

                    addKeywordIfExistsInTermFreqDictionary(linkUrl, word);
                    if (word.contains(".")) {
                        word = removePeriods(word);
                        addKeywordIfExistsInTermFreqDictionary(linkUrl, word);
                    }
                }
            }
        }

        for (String word: wordParts) {
            if (!WordPatterns.isStopWord(word)
                    && WordPatterns.filter(word)
                    && isNewKeywordForLink(word, linkUrl.toString())
            ) {
                linkKeywordConsumer.accept(linkUrl, word);
            }
        }

        for (String word: wordParts) {
            if (word.length() > 2 && word.endsWith("'s")) {
                word = word.substring(0, word.length()-2);
            }

            if (!WordPatterns.isStopWord(word)
                    && WordPatterns.filter(word)
                    && isNewKeywordForLink(word, linkUrl.toString())
            ) {
                linkKeywordConsumer.accept(linkUrl, word);
            }
        }
    }
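    // Emits the keyword only if it exists in the term frequency dictionary
    // (and hasn't already been emitted for this link)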
    private void addKeywordIfExistsInTermFreqDictionary(EdgeUrl linkUrl, String word) {
        if (ngramDict.getTermFreq(word) > 0) {
            if (isNewKeywordForLink(word, linkUrl.toString())) {
                linkKeywordConsumer.accept(linkUrl, word);
            }
        }
    }
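    // Compiled once here so removePeriods() doesn't recompile it on every call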
    Pattern p = Pattern.compile("\\.");

    private String removePeriods(String s) {
        return p.matcher(s).replaceAll("");
    }

    private String domainHash(EdgeUrl url) {
        var domain = url.domain;
        if ("www".equals(domain.subDomain)) {
            return domain.domain;
        }
        return domain.toString();
    }
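    // Trims anything that isn't a letter or digit from both ends of the text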
    private String trimText(String text) {
        int start = text.length()-1;
        int end = 0;

        for (int i = text.length(); i > 0; i--) {
            if (Character.isLetterOrDigit(text.charAt(i-1))) {
                end = i;
                break;
            }
        }

        for (int i = 0; i < end; i++) {
            if (Character.isLetterOrDigit(text.charAt(i))) {
                start = i;
                break;
            }
        }

        if (start >= 0 && start < end) {
            return text.substring(start, end);
        }

        return "";
    }
    // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
    private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();
@ -135,7 +246,7 @@ public class AnchorTextExtractor {
        return includeDomainPredicate.test(linkUrl.domain.toString());
    }

-    private boolean isNewKeywordForLink(String href, String text) {
    private synchronized boolean isNewKeywordForLink(String href, String text) {
        long hash = 0;

        hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).padToLong();