mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Merge pull request 'Preparations for anchor tag inclusion' (#70) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/70
This commit is contained in:
commit
914badd777
@ -0,0 +1,115 @@
|
||||
package nu.marginalia.wmsa.edge.converting;
|
||||
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
public class LinkKeywordLoaderMain {
|
||||
|
||||
public static void main(String... args) {
|
||||
|
||||
Map<String, Long> urlToId = getUrls();
|
||||
try (EdgeIndexClient indexClient = new EdgeIndexClient();
|
||||
var lines = Files.lines(Path.of(args[0]))
|
||||
) {
|
||||
lines
|
||||
.map(UrlKeyword::parseLine)
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(new Uploader(urlToId, indexClient));
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private record UrlKeyword(String url, String keyword) {
|
||||
public static UrlKeyword parseLine(String line) {
|
||||
String[] parts = line.split("\t");
|
||||
if (parts.length == 2) {
|
||||
return new UrlKeyword(parts[0], parts[1]);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static class Uploader implements Consumer<UrlKeyword> {
|
||||
private Map<String, Long> urlToId;
|
||||
private final EdgeIndexClient indexClient;
|
||||
|
||||
private Uploader(Map<String, Long> urlToId,
|
||||
EdgeIndexClient indexClient) {
|
||||
this.urlToId = urlToId;
|
||||
this.indexClient = indexClient;
|
||||
}
|
||||
|
||||
String lastLine = null;
|
||||
Set<String> keywords = new HashSet<>(100);
|
||||
|
||||
@Override
|
||||
public void accept(UrlKeyword urlKeyword) {
|
||||
if (urlKeyword == null) return;
|
||||
|
||||
if (lastLine == null) {
|
||||
lastLine = urlKeyword.url;
|
||||
keywords.add(urlKeyword.keyword);
|
||||
}
|
||||
else if (urlKeyword.url.equals(lastLine)) {
|
||||
keywords.add(urlKeyword.keyword);
|
||||
}
|
||||
else {
|
||||
Long id = urlToId.get(lastLine);
|
||||
|
||||
if (id != null) {
|
||||
int urlId = (int)(id & 0xFFFF_FFFFL);
|
||||
int domainId = (int)(id >>> 32L);
|
||||
|
||||
// System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords);
|
||||
|
||||
indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, new EdgePageWordSet(
|
||||
new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))), 0
|
||||
).blockingSubscribe();
|
||||
}
|
||||
|
||||
lastLine = urlKeyword.url;
|
||||
keywords.clear();
|
||||
keywords.add(urlKeyword.keyword);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static Map<String, Long> getUrls() {
|
||||
|
||||
Map<String, Long> urls = new HashMap<>(100_000);
|
||||
|
||||
try (var ds = new DatabaseModule().provideConnection();
|
||||
var conn = ds.getConnection();
|
||||
var stmt = conn.createStatement())
|
||||
{
|
||||
var rsp = stmt.executeQuery("SELECT URL, ID, DOMAIN_ID FROM EC_URL_VIEW WHERE TITLE IS NOT NULL");
|
||||
|
||||
while (rsp.next()) {
|
||||
long val = rsp.getInt(3);
|
||||
val = (val << 32L) | rsp.getInt(2);
|
||||
|
||||
urls.put(rsp.getString(1), val);
|
||||
}
|
||||
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
return urls;
|
||||
}
|
||||
}
|
@ -5,13 +5,19 @@ import com.google.common.hash.Hashing;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.DenseBitMap;
|
||||
import nu.marginalia.util.language.WordPatterns;
|
||||
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import org.apache.logging.log4j.util.Strings;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
@ -30,12 +36,16 @@ public class AnchorTextExtractor {
|
||||
// de-duplicating billions of shuffled (url, word) tuples on limited hardware
|
||||
private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
|
||||
|
||||
private final NGramDict ngramDict = new NGramDict(WmsaHome.getLanguageModels());
|
||||
|
||||
/**
 * Creates an extractor wired to the given filters and keyword sink.
 *
 * @param includeDomainPredicate tested against the string form of a link's domain
 * @param includeUrlPredicate    URL-level filter, stored for later use
 * @param linkKeywordConsumer    receives each (link URL, keyword) pair that is emitted
 */
public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
                           Predicate<EdgeUrl> includeUrlPredicate,
                           BiConsumer<EdgeUrl, String> linkKeywordConsumer) {
    this.linkKeywordConsumer = linkKeywordConsumer;
    this.includeUrlPredicate = includeUrlPredicate;
    this.includeDomainPredicate = includeDomainPredicate;
}
|
||||
|
||||
@SneakyThrows
|
||||
@ -70,7 +80,11 @@ public class AnchorTextExtractor {
|
||||
return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
|
||||
}
|
||||
|
||||
Set<String> excludedTerminators = Set.of("a", "for", "of", "in", "with", "but", "as", "by", "on", "to", "at", "-");
|
||||
|
||||
private void processAnchor(EdgeUrl documentUrl, String href, String text) {
|
||||
text = trimText(text);
|
||||
|
||||
if (!isInterestingAnchorText(text)) {
|
||||
return;
|
||||
}
|
||||
@ -84,25 +98,122 @@ public class AnchorTextExtractor {
|
||||
return;
|
||||
}
|
||||
|
||||
for (String word: anchorTextNoise.split(text)) {
|
||||
if (WordPatterns.isStopWord(word))
|
||||
continue;
|
||||
if (Objects.equals(domainHash(linkUrl), domainHash(documentUrl))) {
|
||||
return;
|
||||
}
|
||||
|
||||
word = word.toLowerCase();
|
||||
if (!WordPatterns.filter(word)) {
|
||||
continue;
|
||||
String[] wordParts = anchorTextNoise.split(text.toLowerCase());
|
||||
|
||||
if (wordParts.length > 1) {
|
||||
String word = Strings.join(Arrays.asList(wordParts), '_');
|
||||
|
||||
addKeywordIfExistsInTermFreqDictionary(linkUrl, word);
|
||||
|
||||
if (word.contains(".")) {
|
||||
addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word));
|
||||
}
|
||||
|
||||
if (linkUrl.domain.equals(documentUrl.domain)) {
|
||||
continue;
|
||||
if (wordParts.length > 2) {
|
||||
for (int i = 1; i < wordParts.length; i++) {
|
||||
if (excludedTerminators.contains(wordParts[i])) continue;
|
||||
if (excludedTerminators.contains(wordParts[i-1])) continue;
|
||||
|
||||
word = wordParts[i-1] + "_" + wordParts[i];
|
||||
addKeywordIfExistsInTermFreqDictionary(linkUrl, word);
|
||||
|
||||
if (word.contains(".")) {
|
||||
addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (wordParts.length > 3) {
|
||||
for (int i = 2; i < wordParts.length; i++) {
|
||||
if (excludedTerminators.contains(wordParts[i])) continue;
|
||||
if (excludedTerminators.contains(wordParts[i-2])) continue;
|
||||
|
||||
word = wordParts[i-2] + "_" + wordParts[i-1] + "_" + wordParts[i];
|
||||
|
||||
addKeywordIfExistsInTermFreqDictionary(linkUrl, word);
|
||||
|
||||
if (word.contains(".")) {
|
||||
word = removePeriods(word);
|
||||
addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
for (String word: wordParts) {
|
||||
if (!WordPatterns.isStopWord(word)
|
||||
&& WordPatterns.filter(word)
|
||||
&& isNewKeywordForLink(word, linkUrl.toString())
|
||||
) {
|
||||
linkKeywordConsumer.accept(linkUrl, word);
|
||||
}
|
||||
}
|
||||
|
||||
for (String word: wordParts) {
|
||||
if (word.length() > 2 && word.endsWith("'s")) {
|
||||
word = word.substring(0, word.length()-2);
|
||||
}
|
||||
|
||||
if (!WordPatterns.isStopWord(word)
|
||||
&& WordPatterns.filter(word)
|
||||
&& isNewKeywordForLink(word, linkUrl.toString())
|
||||
) {
|
||||
linkKeywordConsumer.accept(linkUrl, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void addKeywordIfExistsInTermFreqDictionary(EdgeUrl linkUrl, String word) {
|
||||
if (ngramDict.getTermFreq(word) > 0) {
|
||||
if (isNewKeywordForLink(word, linkUrl.toString())) {
|
||||
linkKeywordConsumer.accept(linkUrl, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Pattern p = Pattern.compile("\\.");
|
||||
private String removePeriods(String s) {
|
||||
return p.matcher(s).replaceAll("");
|
||||
}
|
||||
|
||||
private String domainHash(EdgeUrl url) {
|
||||
var domain = url.domain;
|
||||
if ("www".equals(domain.subDomain)) {
|
||||
return domain.domain;
|
||||
}
|
||||
return domain.toString();
|
||||
}
|
||||
|
||||
private String trimText(String text) {
|
||||
int start = text.length()-1;
|
||||
int end = 0;
|
||||
|
||||
for (int i = text.length(); i > 0; i--) {
|
||||
if (Character.isLetterOrDigit(text.charAt(i-1))) {
|
||||
end = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < end; i++) {
|
||||
if (Character.isLetterOrDigit(text.charAt(i))) {
|
||||
start = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (start >= 0 && start < end) {
|
||||
return text.substring(start, end);
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
// This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
|
||||
private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();
|
||||
|
||||
@ -135,7 +246,7 @@ public class AnchorTextExtractor {
|
||||
return includeDomainPredicate.test(linkUrl.domain.toString());
|
||||
}
|
||||
|
||||
private boolean isNewKeywordForLink(String href, String text) {
|
||||
private synchronized boolean isNewKeywordForLink(String href, String text) {
|
||||
long hash = 0;
|
||||
|
||||
hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).padToLong();
|
||||
|
@ -19,7 +19,6 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
||||
public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
@ -266,18 +265,26 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlId) {
|
||||
if (urlId.isEmpty())
|
||||
public List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlIds) {
|
||||
if (urlIds.isEmpty())
|
||||
return Collections.emptyList();
|
||||
|
||||
List<BrowseResult> ret = new ArrayList<>(urlId.size());
|
||||
List<BrowseResult> ret = new ArrayList<>(urlIds.size());
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
try (var stmt = conn.createStatement()) {
|
||||
// this is safe, string concatenation is of integers
|
||||
String inStmt = urlId.stream().map(id -> Integer.toString(id.id())).collect(Collectors.joining(", ", "(", ")"));
|
||||
|
||||
var rsp = stmt.executeQuery("SELECT DOMAIN_ID, DOMAIN_NAME FROM EC_URL_VIEW INNER JOIN DOMAIN_METADATA ON EC_URL_VIEW.DOMAIN_ID=DOMAIN_METADATA.ID WHERE KNOWN_URLS<5000 AND QUALITY>-10 AND EC_URL_VIEW.ID IN " + inStmt);
|
||||
String inStmt = idList(urlIds);
|
||||
|
||||
var rsp = stmt.executeQuery("""
|
||||
SELECT DOMAIN_ID, DOMAIN_NAME
|
||||
FROM EC_URL_VIEW
|
||||
INNER JOIN DOMAIN_METADATA ON EC_URL_VIEW.DOMAIN_ID=DOMAIN_METADATA.ID
|
||||
WHERE
|
||||
KNOWN_URLS<5000
|
||||
AND QUALITY>-10
|
||||
AND EC_URL_VIEW.ID IN
|
||||
""" + inStmt); // this injection is safe, inStmt is derived from concatenating a list of integers
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
String domain = rsp.getString(2);
|
||||
|
@ -57,18 +57,15 @@ public class SearchIndexReader implements AutoCloseable {
|
||||
queryBuilders = new EnumMap<>(IndexBlock.class);
|
||||
underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);
|
||||
|
||||
queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex, wordsIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Middle, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex, namesIndex, wordsIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.PositionWords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex, positionIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.NamesWords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, linkIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex));
|
||||
|
||||
underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
|
||||
underspecifiedQueryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(listOfNonNulls(linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
|
||||
underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
|
||||
underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
|
||||
}
|
||||
|
||||
@SafeVarargs
|
||||
@ -121,7 +118,12 @@ public class SearchIndexReader implements AutoCloseable {
|
||||
}
|
||||
|
||||
public Query findWord(IndexBlock block, IndexSearchBudget budget, LongPredicate filter, int wordId) {
|
||||
return queryBuilders.get(block).build(budget, filter, wordId);
|
||||
var builder = queryBuilders.get(block);
|
||||
|
||||
if (builder == null)
|
||||
return Query.EMPTY;
|
||||
|
||||
return builder.build(budget, filter, wordId);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -135,13 +137,20 @@ public class SearchIndexReader implements AutoCloseable {
|
||||
|
||||
@SneakyThrows
|
||||
public long numHits(IndexBlock block, int word) {
|
||||
return numHitsCache.get(Pair.of(block, word),
|
||||
() -> queryBuilders.get(block)
|
||||
.getIndicies()
|
||||
.stream()
|
||||
.mapToLong(idx -> idx.numUrls(word))
|
||||
.sum()
|
||||
);
|
||||
return numHitsCache.get(Pair.of(block, word), () -> numHitsForBlockWord(block, word));
|
||||
}
|
||||
|
||||
private long numHitsForBlockWord(IndexBlock block, int word) {
|
||||
IndexQueryBuilder builder = queryBuilders.get(block);
|
||||
|
||||
if (builder == null)
|
||||
return 0L;
|
||||
|
||||
return builder
|
||||
.getIndicies()
|
||||
.stream()
|
||||
.mapToLong(idx -> idx.numUrls(word))
|
||||
.sum();
|
||||
}
|
||||
|
||||
public IndexBlock getBlockForResult(int searchTerm, long urlId) {
|
||||
|
@ -3,6 +3,17 @@ package nu.marginalia.wmsa.edge.index.reader.query;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
public interface Query {
|
||||
Query EMPTY = new Query() {
|
||||
@Override
|
||||
public Query also(int wordId) { return this; }
|
||||
|
||||
@Override
|
||||
public Query not(int wordId) { return this; }
|
||||
|
||||
@Override
|
||||
public LongStream stream() { return LongStream.empty(); }
|
||||
};
|
||||
|
||||
Query also(int wordId);
|
||||
Query not(int wordId);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user