From db056be06a6bbb855717037f56bd90e3fc070481 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Wed, 24 Aug 2022 22:05:32 +0200
Subject: [PATCH 01/19] WIP logic for detecting significant images in the body of a website.

---
 .../processor/logic/SalientImageDetector.java | 101 +++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SalientImageDetector.java

diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SalientImageDetector.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SalientImageDetector.java
new file mode 100644
index 00000000..271ad6f2
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SalientImageDetector.java
@@ -0,0 +1,101 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Work-in-progress heuristic that looks for an image embedded in the
+ * text-heavy part of a page, as opposed to decoration or navigation.
+ */
+public class SalientImageDetector {
+
+    /**
+     * Reports whether the document contains at least one image that passes
+     * every filter applied below.
+     *
+     * @param document parsed page to inspect
+     * @return {@code true} as soon as a qualifying image is found, {@code false} otherwise
+     */
+    public boolean hasSalientImage(Document document) {
+        // NOTE(review): this filters the list returned by getElementsByTag();
+        // whether the anchors are also detached from the document appears to
+        // depend on the jsoup version in use -- confirm which effect is intended.
+        document.getElementsByTag("a").removeIf(Element::hasText);
+
+        // Number of <img> tags per src attribute value.
+        Map<String, Integer> counts = new HashMap<>();
+        for (var elem : document.getElementsByTag("img")) {
+            counts.merge(elem.attr("src"), 1, Integer::sum);
+        }
+        for (var elem : document.select("p,div,section,article,font,center")) {
+
+            // <p>, <center> and <font> elements with fewer than 16 characters of text are ignored.
+            String tagName = elem.tagName();
+            if (("p".equals(tagName) || "center".equals(tagName) || "font".equals(tagName))
+                    && elem.text().length() < 16)
+            {
+                continue;
+            }
+
+            // A NaN density (element without text) fails both comparisons, so such elements are skipped.
+            if (aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85) {
+                for (var imgTag : elem.getElementsByTag("img")) {
+                    // A src that occurs more than once in the document is skipped.
+                    if (counts.getOrDefault(imgTag.attr("src"), 1) > 1) {
+                        continue;
+                    }
+
+                    // Small or unsized images are skipped only when they carry an id.
+                    if (isSmall(imgTag)) {
+                        if (!imgTag.id().isBlank()) {
+                            continue;
+                        }
+                    }
+
+                    return true;
+                }
+            }
+        }
+
+        return false;
+
+    }
+
+    /**
+     * Classifies an image by its {@code width} and {@code height} attributes.
+     * A blank attribute makes the image small. Otherwise width, then height, is
+     * parsed and the first value below 400 makes it small; a value that is not
+     * an integer ends the check and the image is reported as not small.
+     */
+    private boolean isSmall(Element imgTag) {
+        final String width = imgTag.attr("width");
+        final String height = imgTag.attr("height");
+
+        if (width.isBlank() || height.isBlank())
+            return true;
+
+        try {
+            if (Integer.parseInt(width) < 400)
+                return true;
+            if (Integer.parseInt(height) < 400)
+                return true;
+        }
+        catch (NumberFormatException ex) { /* no-op */ }
+
+        return false;
+    }
+
+    /** Length of the element's text divided by the length of its inner HTML; NaN when both are zero. */
+    private double htmlTagDensity(Element elem) {
+        return (double) elem.text().length() / elem.html().length();
+    }
+
+    /** Length of the combined text of {@code <a>} elements under {@code elem} divided by the length of its text. */
+    private double aTagDensity(Element elem) {
+        return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
+    }
+
+}
From 3200c360723760c243b0b9c6ad601a62b2e7563d Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Fri, 26 Aug 2022 16:08:46 +0200
Subject: [PATCH 02/19] Experimental changes for 22-08/09 update.

---
 marginalia_nu/build.gradle | 5 +-
 .../marginalia/util/dict/DictionaryData.java | 14 +--
 .../util/language/WordPatterns.java | 5 +
 .../processing/DocumentKeywordExtractor.java | 110 ++++++++++-------
 .../wmsa/client/AbstractClient.java | 28 ++++-
 .../converting/LinkKeywordLoaderMain.java | 2 +-
 .../converting/loader/IndexLoadKeywords.java | 2 +-
 .../processor/DocumentProcessor.java | 7 +-
 .../converting/processor/logic/DomPruner.java | 111 ++++++++++++++++++
 .../wmsa/edge/index/EdgeIndexService.java | 60 +++-------
 .../edge/index/client/EdgeIndexClient.java | 22 +++-
 .../journal/SearchIndexJournalWriterImpl.java | 47 ++++++--
 .../edge/index/model/EdgePutWordsRequest.java | 10 +-
 .../wmsa/edge/index/model/IndexBlock.java | 14 ++-
 .../edge/index/reader/SearchIndexReader.java | 33 ++++--
 .../index/reader/query/IndexQueryBuilder.java | 3 +-
 .../StackOverflowPostProcessor.java | 8 +-
 .../wikipedia/WikipediaProcessor.java | 8 +-
 .../edge/model/crawl/EdgePageWordSet.java | 20 +++-
 .../wmsa/edge/model/crawl/EdgePageWords.java | 20 +++-
 .../wmsa/edge/search/EdgeSearchProfile.java | 22 ++--
 .../command/commands/SiteSearchCommand.java | 2 +-
 .../search/results/SearchResultValuator.java | 2 +-
 .../wmsa/edge/tools/FeaturesLoaderTool.java | 2
+- .../main/resources/sql/edge-crawler-cache.sql | 2 +- .../processor/logic/DomPrunerTest.java | 12 ++ .../edge/crawling/SentenceExtractorTest.java | 13 +- .../index/service/EdgeIndexClientTest.java | 13 +- .../service/SearchIndexJournalWriterTest.java | 2 +- protocol/build.gradle | 27 +++++ protocol/def/index.proto | 21 ++++ settings.gradle | 3 +- 32 files changed, 475 insertions(+), 175 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPrunerTest.java create mode 100644 protocol/build.gradle create mode 100644 protocol/def/index.proto diff --git a/marginalia_nu/build.gradle b/marginalia_nu/build.gradle index 304af5c8..8a8f3e0a 100644 --- a/marginalia_nu/build.gradle +++ b/marginalia_nu/build.gradle @@ -58,7 +58,7 @@ jmhJar { } dependencies { implementation project(':third_party') - + implementation project(':protocol') implementation 'org.projectlombok:lombok:1.18.24' annotationProcessor 'org.projectlombok:lombok:1.18.24' @@ -157,6 +157,9 @@ dependencies { jmh 'org.openjdk.jmh:jmh-core:1.35' jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35' + + implementation 'com.dslplatform:dsl-json:1.9.9' + annotationProcessor 'com.dslplatform:dsl-json-processor:1.9.9' } configurations { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java index 9aa953dc..bbb17c51 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java @@ -1,18 +1,18 @@ package nu.marginalia.util.dict; -import nu.marginalia.util.SeekDictionary; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.ByteBuffer; import java.nio.LongBuffer; +import java.util.ArrayList; public class DictionaryData { private 
final int DICTIONARY_BANK_SIZE; private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class); - private final SeekDictionary banks = SeekDictionary.of(DictionaryDataBank::getSize); + private final ArrayList banks = new ArrayList(100); public DictionaryData(int bankSize) { DICTIONARY_BANK_SIZE = bankSize; @@ -20,12 +20,8 @@ public class DictionaryData { banks.add(new DictionaryDataBank(0, bankSize)); } - public int size() { - return banks.end(); - } - public int add(long key) { - var activeBank = banks.last(); + var activeBank = banks.get(banks.size()-1); int rb = activeBank.add(key); if (rb == -1) { @@ -42,10 +38,10 @@ public class DictionaryData { public long getKey(int offset) { - return banks.bankForOffset(offset).getKey(offset); + return banks.get(offset/DICTIONARY_BANK_SIZE).getKey(offset); } public boolean keyEquals(int offset, long otherKey) { - return banks.bankForOffset(offset).keyEquals(offset, otherKey); + return banks.get(offset/DICTIONARY_BANK_SIZE).keyEquals(offset, otherKey); } private static class DictionaryDataBank { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java index 3a95072b..b7a588db 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java @@ -19,7 +19,12 @@ public class WordPatterns { public static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))"); public static final Pattern characterNoisePattern = Pattern.compile("^[/+\\-]+$"); + public static final Pattern singleWordAdditionalPattern = + Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}"); + + public static final Predicate singleWordQualitiesPredicate = singleWordAdditionalPattern.asMatchPredicate(); public static final Predicate 
wordQualitiesPredicate = wordPattern.asMatchPredicate(); + public static final Predicate wordAppendixPredicate = wordAppendixPattern.asMatchPredicate(); public static final Predicate wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate); public static final Predicate characterNoisePredicate = characterNoisePattern.asMatchPredicate(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java index 33b88671..479dcd4c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java @@ -8,7 +8,6 @@ import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; -import org.jetbrains.annotations.NotNull; import javax.inject.Inject; import java.util.*; @@ -45,7 +44,6 @@ public class DocumentKeywordExtractor { List wordsNamesRepeated = nameCounter.count(documentLanguageData, 2); List wordsNamesAll = nameCounter.count(documentLanguageData, 1); List subjects = subjectCounter.count(documentLanguageData); - List wordsLongName = longNameCounter.count(documentLanguageData); int totalSize = wordsTfIdf.size(); @@ -61,17 +59,6 @@ public class DocumentKeywordExtractor { var wordsToMatchWithTitle = joinWordLists(topKeywords, midKeywords, wordsNamesRepeated, subjects); - var words = getSimpleWords(documentLanguageData); - - for (var w : wordsLongName) - words.add(w.word); - for (var w : lowKeywords) - words.remove(w.word); - for (var w : midKeywords) - words.remove(w.word); - for (var w : topKeywords) - words.remove(w.word); - Collection artifacts = getArtifacts(documentLanguageData); var wordSet = new 
EdgePageWordSet( @@ -85,15 +72,81 @@ public class DocumentKeywordExtractor { new EdgePageWords(IndexBlock.Artifacts, artifacts) ); - wordSet.append(IndexBlock.Words, words); + getSimpleWords(wordSet, documentLanguageData, + IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus); return wordSet; } + private void getSimpleWords(EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) { + + int start = 0; + int lengthGoal = 32; + + for (int blockIdx = 0; blockIdx < blocks.length-1 && start < documentLanguageData.sentences.length; blockIdx++) { + IndexBlock block = blocks[blockIdx]; + Set words = new HashSet<>(lengthGoal+100); + + int pos; + int length = 0; + for (pos = start; pos < documentLanguageData.sentences.length && length < lengthGoal; pos++) { + var sent = documentLanguageData.sentences[pos]; + length += sent.length(); + + for (var word : sent) { + if (!word.isStopWord()) { + String w = AsciiFlattener.flattenUnicode(word.wordLowerCase()); + if (WordPatterns.singleWordQualitiesPredicate.test(w)) { + words.add(w); + } + } + } + } + wordSet.append(block, words); + start = pos; + lengthGoal+=32; + } + + if (start < documentLanguageData.sentences.length) { + + Map counts = new HashMap<>(documentLanguageData.totalNumWords()); + for (int pos = start; pos < documentLanguageData.sentences.length && counts.size() < lengthGoal; pos++) { + var sent = documentLanguageData.sentences[pos]; + for (var word : sent) { + if (!word.isStopWord()) { + String w = AsciiFlattener.flattenUnicode(word.wordLowerCase()); + if (counts.containsKey(w) || (WordPatterns.singleWordQualitiesPredicate.test(w))) { + counts.merge(w, 1, Integer::sum); + } + } + } + } + + Set lastSet; + if (counts.size() < 1024) { + lastSet = counts.keySet(); + } + else { + lastSet = counts.entrySet().stream() + .sorted(Comparator.comparing(e -> { + double N = 11820118.; // Number of documents in term freq dictionary + + // Caveat: 
This is actually the *negated* term score, because the second logarithm has + // its parameter inverted (log(a^b) = b log(a); here b = -1) + return (1 + Math.log(e.getValue())) * Math.log((1. + dict.getTermFreq(e.getKey())) / N); + })) + .map(Map.Entry::getKey) + .limit(1024) + .collect(Collectors.toCollection(LinkedHashSet::new)); + } + + wordSet.append(blocks[blocks.length - 1], lastSet); + } + } + private Collection getArtifacts(DocumentLanguageData documentLanguageData) { Set reps = new HashSet<>(); - for (var sent : documentLanguageData.sentences) { for (var word : sent) { String lc = word.wordLowerCase(); @@ -138,33 +191,6 @@ public class DocumentKeywordExtractor { return ret; } - @NotNull - private Set getSimpleWords(DocumentLanguageData documentLanguageData) { - Map counts = new HashMap<>(documentLanguageData.totalNumWords()); - - for (var sent : documentLanguageData.sentences) { - for (int i = 0; i < sent.length(); i++) { - if (!sent.isStopWord(i)) { - String w = AsciiFlattener.flattenUnicode(sent.wordsLowerCase[i]); - if (counts.containsKey(w) || (WordPatterns.wordQualitiesPredicate.test(w) && WordPatterns.filter(w))) { - counts.merge(w, 1, Integer::sum); - } - } - } - } - - return counts.entrySet().stream() - .sorted(Comparator.comparing(e -> { - double N = 11820118.; // Number of documents in term freq dictionary - - // Caveat: This is actually the *negated* term score, because the second logarithm has - // its parameter inverted (log(a^b) = b log(a); here b = -1) - return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N); - })) - .map(Map.Entry::getKey) - .limit(512).collect(Collectors.toCollection(LinkedHashSet::new)); - } - public EdgePageWords createWords(IndexBlock block, Collection words) { return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet())); diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java index 603f57e5..569b7eaa 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java @@ -2,6 +2,7 @@ package nu.marginalia.wmsa.client; import com.google.gson.Gson; import com.google.gson.GsonBuilder; +import com.google.protobuf.GeneratedMessageV3; import io.reactivex.rxjava3.core.Observable; import io.reactivex.rxjava3.core.ObservableSource; import io.reactivex.rxjava3.plugins.RxJavaPlugins; @@ -17,8 +18,6 @@ import org.apache.http.HttpHost; import org.apache.logging.log4j.ThreadContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.slf4j.Marker; -import org.slf4j.MarkerFactory; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -186,6 +185,31 @@ public abstract class AbstractClient implements AutoCloseable { .doFinally(() -> ThreadContext.remove("outbound-request")); } + @SneakyThrows + protected synchronized Observable post(Context ctx, String endpoint, GeneratedMessageV3 data) { + + ensureAlive(); + + RequestBody body = RequestBody.create( + MediaType.parse("application/protobuf"), + data.toByteArray()); + + var req = ctx.paint(new Request.Builder()).url(url + endpoint).post(body).build(); + var call = client.newCall(req); + + logInbound(call); + ThreadContext.put("outbound-request", url + endpoint); + try (var rsp = call.execute()) { + logOutbound(rsp); + int code = rsp.code(); + + return validateStatus(code, req).map(HttpStatusCode::new); + } + finally { + ThreadContext.remove("outbound-request"); + } + } + @SneakyThrows protected synchronized Observable postGet(Context ctx, String endpoint, Object data, Class returnType) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java index b68ee68c..02f502d2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java @@ -76,7 +76,7 @@ public class LinkKeywordLoaderMain { // System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords); - indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, new EdgePageWordSet( + indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), new EdgePageWordSet( new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))), 0 ).blockingSubscribe(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java index 46d71505..486eb343 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java @@ -39,7 +39,7 @@ public class IndexLoadKeywords implements Runnable { while (!canceled) { var data = insertQueue.poll(1, TimeUnit.SECONDS); if (data != null) { - client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, index).blockingSubscribe(); + client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.wordSet, index).blockingSubscribe(); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index ee106cce..ef88c831 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -147,7 +147,10 @@ public class DocumentProcessor { throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS); } - var dld = sentenceExtractor.extractSentences(doc.clone()); + DomPruner domPruner = new DomPruner(); + Document prunedDoc = doc.clone(); + domPruner.prune(prunedDoc, 0.5); + var dld = sentenceExtractor.extractSentences(prunedDoc); checkDocumentLanguage(dld); @@ -192,7 +195,7 @@ public class DocumentProcessor { ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add); words.append(IndexBlock.Meta, tagWords); - words.append(IndexBlock.Words, tagWords); + words.append(IndexBlock.Words_1, tagWords); } private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java
new file mode 100644
index 00000000..ebe3de66
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java
@@ -0,0 +1,111 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.select.NodeVisitor;
+
+import java.util.HashMap;
+import java.util.Map;
+/** Strips DOM subtrees whose text is absent or mostly sits under link, nav, footer or header elements. */
+public class DomPruner {
+    /** Removes low-signal nodes from {@code document} in place; the exact conditions are in the loop below. */
+    public void prune(Document document, double pruneThreshold) {
+        PruningVisitor pruningVisitor = new PruningVisitor();
+        document.traverse(pruningVisitor);
+        // Depth <= 1 is always kept; deeper nodes go when they hold no signal text, or mix in noise below the threshold.
+        pruningVisitor.data.forEach((node, data) -> {
+            if (data.depth <= 1) {
+                return;
+            }
+            if (data.signalNodeSize == 0) node.remove();
+            else if (data.noiseNodeSize > 0
+                    && data.signalRate() < pruneThreshold
+                    && data.treeSize > 3) {
+                node.remove();
+            }
+        });
+    }
+
+
+    /** Visitor that records, for every node, how much of the text beneath it counts as signal or as noise. */
+    private static class PruningVisitor implements NodeVisitor {
+
+        private final Map<Node, NodeData> data = new HashMap<>();
+        private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0); // fallback for a child without an entry
+
+        @Override
+        public void head(Node node, int depth) {}
+
+        @Override
+        public void tail(Node node, int depth) {
+            final NodeData dataForNode;
+            // Text nodes count their own length as signal; other nodes aggregate what their children recorded.
+            if (node instanceof TextNode tn) {
+                dataForNode = new NodeData(depth, tn.text().length(), 0);
+            }
+            else if (isSignal(node)) {
+                dataForNode = new NodeData(depth, 0,0);
+                for (var childNode : node.childNodes()) {
+                    dataForNode.add(data.getOrDefault(childNode, dummy));
+                }
+            }
+            else {
+                dataForNode = new NodeData(depth, 0,0);
+                for (var childNode : node.childNodes()) {
+                    dataForNode.addAsNoise(data.getOrDefault(childNode, dummy));
+                }
+            }
+
+
+            // Stored so that the parent's tail() call can pick it up.
+            data.put(node, dataForNode);
+        }
+
+        public boolean isSignal(Node node) {
+            // <a>, <nav>, <footer> and <header> elements are noise; every other node is signal.
+            if (node instanceof Element e) {
+                if ("a".equalsIgnoreCase(e.tagName()))
+                    return false;
+                if ("nav".equalsIgnoreCase(e.tagName()))
+                    return false;
+                if ("footer".equalsIgnoreCase(e.tagName()))
+                    return false;
+                if ("header".equalsIgnoreCase(e.tagName()))
+                    return false;
+            }
+
+            return true;
+        }
+    }
+
+    private static class NodeData {
+        int signalNodeSize; // characters of text counted as signal in this subtree
+        int noiseNodeSize; // characters of text counted as noise in this subtree
+        int treeSize = 1; // nodes in this subtree, including the node itself
+        int depth;
+
+        private NodeData(int depth, int signalNodeSize, int noiseNodeSize) {
+            this.depth = depth;
+            this.signalNodeSize = signalNodeSize;
+            this.noiseNodeSize = noiseNodeSize;
+        }
+
+        public void add(NodeData other) {
+            signalNodeSize += other.signalNodeSize;
+            noiseNodeSize += other.noiseNodeSize;
+            treeSize += other.treeSize;
+        }
+
+        public void addAsNoise(NodeData other) {
+            noiseNodeSize += other.noiseNodeSize + other.signalNodeSize;
+            treeSize += other.treeSize;
+        }
+
+        /** Signal characters divided by all counted characters; NaN when nothing was counted. */
+        public double signalRate() {
+            return signalNodeSize / (double)(signalNodeSize + noiseNodeSize);
+        }
+    }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index f9ed8ecc..939b625b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -4,11 +4,11 @@ import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.google.inject.Inject; import com.google.inject.name.Named; +import com.google.protobuf.InvalidProtocolBufferException; import gnu.trove.map.TLongIntMap; import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TLongIntHashMap; import gnu.trove.set.hash.TIntHashSet; -import io.prometheus.client.Counter; import io.prometheus.client.Histogram; import io.reactivex.rxjava3.schedulers.Schedulers; import marcono1234.gson.recordadapter.RecordTypeAdapterFactory; @@ -22,18 +22,16 @@ import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; -import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import nu.marginalia.wmsa.edge.model.search.*; import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; +import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq; import org.apache.http.HttpStatus; import 
org.jetbrains.annotations.NotNull; import org.slf4j.Logger; @@ -52,7 +50,7 @@ import static spark.Spark.get; import static spark.Spark.halt; public class EdgeIndexService extends Service { - private static final int SEARCH_BUDGET_TIMEOUT_MS = 100; + private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -66,11 +64,9 @@ public class EdgeIndexService extends Service { .create(); private static final Histogram wmsa_edge_index_query_time - = Histogram.build().name("wmsa_edge_index_query_time").help("-").register(); - private static final Counter wmsa_edge_index_query_count - = Counter.build().name("wmsa_edge_index_query_count").help("-").register(); - private static final Histogram wmsa_edge_index_put_words_time - = Histogram.build().name("wmsa_edge_index_put_words_time").help("-").register(); + = Histogram.build().name("wmsa_edge_index_query_time") + .linearBuckets(50, 50, 15) + .help("-").register(); public static final int DYNAMIC_BUCKET_LENGTH = 7; @@ -162,12 +158,15 @@ public class EdgeIndexService extends Service { indexes.initialize(init); } - private Object putWords(Request request, Response response) { - var putWordsRequest = gson.fromJson(request.body(), EdgePutWordsRequest.class); + private Object putWords(Request request, Response response) throws InvalidProtocolBufferException { + var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes()); - synchronized (this) { - putWords(putWordsRequest.getDomainId(), putWordsRequest.getUrlId(), - putWordsRequest.wordSet, putWordsRequest.getIndex()); + EdgeId domainId = new EdgeId<>(req.getDomain()); + EdgeId urlId = new EdgeId<>(req.getUrl()); + int idx = req.getIndex(); + + for (int ws = 0; ws < req.getWordSetCount(); ws++) { + putWords(domainId, urlId, req.getWordSet(ws), idx); } response.status(HttpStatus.SC_ACCEPTED); @@ -175,26 +174,16 @@ public class EdgeIndexService extends Service { } public void putWords(EdgeId domainId, EdgeId urlId, 
- EdgePageWordSet wordSet, int idx - ) { - - wmsa_edge_index_put_words_time.time(() -> { - for (EdgePageWords words : wordSet.values()) { - putWords(domainId, urlId, words, idx); - } - }); - - } - - public void putWords(EdgeId domainId, EdgeId urlId, - EdgePageWords words, int idx + IndexPutKeywordsReq.WordSet words, int idx ) { SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx); - for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) { + IndexBlock block = IndexBlock.values()[words.getIndex()]; + + for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) { var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk)); - var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block); + var header = new SearchIndexJournalEntryHeader(domainId, urlId, block); indexWriter.put(header, entry); }; @@ -257,7 +246,6 @@ public class EdgeIndexService extends Service { } finally { wmsa_edge_index_query_time.observe(System.currentTimeMillis() - start); - wmsa_edge_index_query_count.inc(); } } @@ -410,16 +398,6 @@ public class EdgeIndexService extends Service { } - public LongStream getHotDomainsQuery(int bucket, IndexBlock block, int wordId, - int queryDepth, int minHitCount, int maxResults) { - if (!indexes.isValidBucket(bucket)) { - logger.warn("Invalid bucket {}", bucket); - return LongStream.empty(); - } - - return indexes.getBucket(bucket).findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults); - } - private LongStream getQuery(int bucket, IndexSearchBudget budget, IndexBlock block, LongPredicate filter, EdgeIndexSearchTerms searchTerms) { if (!indexes.isValidBucket(bucket)) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java index 36cd966e..48b1b3ee 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java @@ -9,7 +9,6 @@ import nu.marginalia.wmsa.client.AbstractDynamicClient; import nu.marginalia.wmsa.client.HttpStatusCode; import nu.marginalia.wmsa.configuration.ServiceDescriptor; import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.EdgeUrl; @@ -18,6 +17,7 @@ import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; +import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -37,13 +37,27 @@ public class EdgeIndexClient extends AbstractDynamicClient { } @CheckReturnValue - public Observable putWords(Context ctx, EdgeId domain, EdgeId url, double quality, + public Observable putWords(Context ctx, EdgeId domain, EdgeId url, EdgePageWordSet wordSet, int writer ) { - EdgePutWordsRequest request = new EdgePutWordsRequest(domain, url, quality, wordSet, writer); - return this.post(ctx, "/words/", request); + var keywordBuilder = + IndexPutKeywordsReq.newBuilder() + .setDomain(domain.id()) + .setUrl(url.id()) + .setIndex(writer); + + for (var set : wordSet.wordSets.values()) { + var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder(); + wordSetBuilder.setIndex(set.block.ordinal()); + wordSetBuilder.addAllWords(set.words); + keywordBuilder.addWordSet(wordSetBuilder.build()); + } + + var req = keywordBuilder.build(); + + return this.post(ctx, "/words/", req); } diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java index 23c4b481..a11ee5d0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java @@ -15,6 +15,7 @@ import java.io.IOException; import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; +import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { @@ -36,6 +37,8 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { byteBuffer = ByteBuffer.allocate(MAX_BLOCK_SIZE); + new Thread(this::journalWriterThread, "Journal Writer").start(); + writerTask = Schedulers.io().schedulePeriodicallyDirect(this::forceWrite, 1, 1, TimeUnit.SECONDS); Runtime.getRuntime().addShutdownHook(new Thread(this::forceWrite)); } @@ -56,25 +59,45 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { } } + private record WriteJob(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {} + private final LinkedBlockingQueue writeQueue = new LinkedBlockingQueue<>(512); + @Override @SneakyThrows - public synchronized void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) { + public void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) { + writeQueue.put(new WriteJob(header, entryData)); + } - byteBuffer.clear(); + @SneakyThrows + public void journalWriterThread() { - byteBuffer.putInt(entryData.size()); - byteBuffer.putInt(header.block().id); - byteBuffer.putLong(header.documentId()); + while (true) { + var job = writeQueue.take(); - entryData.write(byteBuffer); + 
writeEntry(job.header, job.entryData); + } + } + private synchronized void writeEntry(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) { - byteBuffer.limit(byteBuffer.position()); - byteBuffer.rewind(); + try { + byteBuffer.clear(); - while (byteBuffer.position() < byteBuffer.limit()) - channel.write(byteBuffer); + byteBuffer.putInt(entryData.size()); + byteBuffer.putInt(header.block().id); + byteBuffer.putLong(header.documentId()); - writePositionMarker(); + entryData.write(byteBuffer); + + byteBuffer.limit(byteBuffer.position()); + byteBuffer.rewind(); + + while (byteBuffer.position() < byteBuffer.limit()) + channel.write(byteBuffer); + + writePositionMarker(); + } catch (IOException e) { + throw new RuntimeException(e); + } } @Override @@ -94,13 +117,11 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { } private void writePositionMarker() throws IOException { - var lock = channel.lock(0, 16, false); pos = channel.size(); raf.seek(0); raf.writeLong(pos); raf.writeLong(dictionaryWriter.size()); raf.seek(pos); - lock.release(); } public synchronized void close() throws IOException { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePutWordsRequest.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePutWordsRequest.java index dc541c5b..2494b99c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePutWordsRequest.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePutWordsRequest.java @@ -5,16 +5,16 @@ import lombok.Getter; import lombok.ToString; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; @AllArgsConstructor @Getter @ToString public class EdgePutWordsRequest { - public final EdgeId 
domainId; - public final EdgeId urlId; - public final double quality; + public EdgeId domainId; + public EdgeId urlId; + public double quality; - public final EdgePageWordSet wordSet; + public EdgePageWordSet wordSet; private int index = 0; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java index 819706fd..f35fcf3a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java @@ -5,14 +5,18 @@ public enum IndexBlock { Title(1, 1), Link(2, 1.25), Top(3, 2), - Middle(4, 3), - Low(5, 4), - Words(6, 6), + Middle(4, 2.5), + Low(5, 3.0), + Words_1(6, 3.0), Meta(7, 7), - PositionWords(8, 4.5), + Words_2(8, 3.5), NamesWords(9, 5), Artifacts(10, 10), - Topic(11, 0.5); + Topic(11, 0.5), + Words_4(12, 4.0), + Words_8(13, 4.5), + Words_16Plus(14, 7.0), + ; public final int id; public final double sortOrder; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java index 65a4dafe..cc2927ca 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java @@ -29,8 +29,12 @@ public class SearchIndexReader implements AutoCloseable { IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, - IndexBlock.Words, IndexBlock.NamesWords, + IndexBlock.Words_1, + IndexBlock.Words_2, + IndexBlock.Words_4, + IndexBlock.Words_8, + IndexBlock.Words_16Plus, }; @Inject @@ -44,24 +48,29 @@ public class SearchIndexReader implements AutoCloseable { var linkIndex = indices.get(IndexBlock.Link); var titleIndex = indices.get(IndexBlock.Title); var namesIndex = indices.get(IndexBlock.NamesWords); - var 
positionIndex = indices.get(IndexBlock.PositionWords); var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords); - var wordsIndex = indices.get(IndexBlock.Words); var metaIndex = indices.get(IndexBlock.Meta); var topicIndex = indices.get(IndexBlock.Topic); + var words1 = indices.get(IndexBlock.Words_1); + var words2 = indices.get(IndexBlock.Words_2); + var words4 = indices.get(IndexBlock.Words_4); + var words8 = indices.get(IndexBlock.Words_8); + var words16 = indices.get(IndexBlock.Words_16Plus); + var artifacts = indices.get(IndexBlock.Artifacts); + queryBuilders = new EnumMap<>(IndexBlock.class); underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class); - queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex, namesIndex, wordsIndex), wordsIndex)); - queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex), wordsIndex)); - queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex)); - queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex)); - queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex)); + queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1), words1)); + queryBuilders.put(IndexBlock.Words_2, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words2), words1)); + queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words4), words1)); + queryBuilders.put(IndexBlock.Words_8, new 
IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words8), words1)); + queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1)); - underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex)); - underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex)); - underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex)); + underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1)); + underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1)); + underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1)); } @SafeVarargs @@ -157,7 +166,7 @@ public class SearchIndexReader implements AutoCloseable { return block; } } - return IndexBlock.Words; + return IndexBlock.Words_1; } public boolean isTermInBucket(IndexBlock block, int searchTerm, long urlId) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java index 1b27ddd0..78e132b3 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java @@ -27,7 +27,8 @@ public class IndexQueryBuilder { public Query build(IndexSearchBudget budget, LongPredicate filter, - int wordId) { + int wordId) + { return new QueryForIndices(budget, filter, wordId); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java index 1338473a..a5fb0656 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java @@ -1,13 +1,13 @@ package nu.marginalia.wmsa.edge.integration.stackoverflow; import com.google.inject.Inject; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost; -import nu.marginalia.wmsa.edge.model.*; +import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; @@ -46,8 +46,8 @@ public class StackOverflowPostProcessor { var keywords = documentKeywordExtractor.extractKeywords(dld); keywords.get(IndexBlock.Meta).addJust("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words).addJust("site:"+post.getUrl().domain); - 
keywords.get(IndexBlock.Words).addJust("special:wikipedia"); + keywords.get(IndexBlock.Words_1).addJust("site:"+post.getUrl().domain); + keywords.get(IndexBlock.Words_1).addJust("special:wikipedia"); keywords.get(IndexBlock.Meta).addJust("special:wikipedia"); keywords.get(IndexBlock.Meta).addJust("js:true"); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java index df066adb..22536b90 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java @@ -1,13 +1,13 @@ package nu.marginalia.wmsa.edge.integration.wikipedia; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink; import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -42,8 +42,8 @@ public class WikipediaProcessor { var keywords = documentKeywordExtractor.extractKeywords(dld); keywords.get(IndexBlock.Meta).addJust("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words).addJust("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words).addJust("special:stackoverflow"); + keywords.get(IndexBlock.Words_1).addJust("site:"+post.getUrl().domain); + 
keywords.get(IndexBlock.Words_1).addJust("special:stackoverflow"); keywords.get(IndexBlock.Meta).addJust("special:stackoverflow"); keywords.get(IndexBlock.Meta).addJust("js:true"); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java index c4355ae3..6fdaf059 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java @@ -1,13 +1,15 @@ package nu.marginalia.wmsa.edge.model.crawl; +import com.dslplatform.json.JsonObject; +import com.dslplatform.json.JsonWriter; import lombok.Data; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import java.util.*; @Data -public class EdgePageWordSet { - public final Map wordSets; +public class EdgePageWordSet implements JsonObject { + public Map wordSets; public EdgePageWordSet(EdgePageWords... 
words) { wordSets = new EnumMap<>(IndexBlock.class); @@ -45,4 +47,18 @@ public class EdgePageWordSet { }); return sj.toString(); } + + @Override + public void serialize(JsonWriter writer, boolean minimal) { + writer.writeAscii("["); + boolean first = false; + for (var w : wordSets.values()) { + if (!first) first = true; + else writer.writeAscii(", "); + + w.serialize(writer, minimal); + } + writer.writeAscii("]}"); + + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java index efb20dcc..4a158c25 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java @@ -1,4 +1,7 @@ package nu.marginalia.wmsa.edge.model.crawl; +import com.dslplatform.json.JsonObject; +import com.dslplatform.json.JsonWriter; +import com.dslplatform.json.NumberConverter; import lombok.Getter; import lombok.ToString; import nu.marginalia.wmsa.edge.index.model.IndexBlock; @@ -8,7 +11,7 @@ import java.util.Collection; import java.util.List; @ToString @Getter -public class EdgePageWords { +public class EdgePageWords implements JsonObject { public final IndexBlock block; public final List words = new ArrayList<>(); @@ -31,4 +34,19 @@ public class EdgePageWords { return words.size(); } public void addJust(String word) { words.add(word); } + + @Override + public void serialize(JsonWriter writer, boolean minimal) { + writer.writeAscii("{\"b\":"); + NumberConverter.serialize(block.ordinal(), writer); + writer.writeAscii(", \"w\": ["); + boolean first = false; + for (var word : words) { + if (!first) first = true; + else { writer.writeAscii(","); } + + writer.writeString(word); + } + writer.writeAscii("]}"); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java index 64f8f8b1..f9deecd2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java @@ -10,25 +10,31 @@ import java.util.stream.Collectors; public enum EdgeSearchProfile { DEFAULT("default", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Link, + IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus + ), 0, 1), MODERN("modern", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Link, IndexBlock.NamesWords, + IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus + ), 2), CORPO("corpo", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), - 4, 5, 6, 7), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords, + IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus), + 4, 5, 7), YOLO("yolo", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords, + IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, 
IndexBlock.Words_8, IndexBlock.Words_16Plus), 0, 2, 1, 3, 4, 6), CORPO_CLEAN("corpo-clean", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords), 4, 5), ACADEMIA("academia", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords), 3), FOOD("food", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords), 2, 0), ; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java index fafcaa4b..22969872 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java @@ -62,7 +62,7 @@ public class SiteSearchCommand implements SearchCommandInterface { DecoratedSearchResultSet resultSet; Path screenshotPath = null; if (null != domain) { - resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words, 100, 100, "site:"+domain); + resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, "site:"+domain); screenshotPath = 
Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id()); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java index 8bcd93a6..e82153ed 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java @@ -30,7 +30,7 @@ public class SearchResultValuator { EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> !w.keyword.contains(":")).toArray(EdgeSearchResultKeywordScore[]::new); if (scores.length == 0) { - return IndexBlock.Words.sortOrder; + return IndexBlock.Words_1.sortOrder; } final double[] weights = getTermWeights(scores); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java index 2e3398da..d3ed948e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java @@ -51,7 +51,7 @@ public class FeaturesLoaderTool { throw new RuntimeException(ex); } - client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, wordSet, 0) + client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), wordSet, 0) .blockingSubscribe(); }); diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index c1dc9aa9..d2def737 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -141,7 +141,7 @@ CREATE OR REPLACE VIEW EC_URL_VIEW AS EC_PAGE_DATA.FEATURES AS FEATURES, EC_DOMAIN.IP AS IP, - EC_DOMAIN.STATE AS STATE, + EC_URL.STATE AS 
STATE, EC_DOMAIN.RANK AS RANK, EC_DOMAIN.STATE AS DOMAIN_STATE FROM EC_URL diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPrunerTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPrunerTest.java new file mode 100644 index 00000000..a0eb5ba5 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPrunerTest.java @@ -0,0 +1,12 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic; + +import org.junit.jupiter.api.Test; + +import java.io.IOException; + +class DomPrunerTest { + @Test + public void test() throws IOException { + + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java index f78fb757..ea742a93 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java @@ -1,10 +1,8 @@ package nu.marginalia.wmsa.edge.crawling; -import com.zaxxer.hikari.HikariConfig; -import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; +import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.KeywordExtractor; @@ -12,11 +10,9 @@ import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.util.language.processing.model.tag.WordSeparator; -import nu.marginalia.util.ranking.BuggyReversePageRank; -import 
nu.marginalia.util.ranking.BuggyStandardPageRank; +import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader; import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; @@ -103,6 +99,11 @@ class SentenceExtractorTest { }); reader.join(); } + + @Test + public void testPattern() { + System.out.println(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches()); + } @Test void extractSentences() throws IOException { var data = Path.of("/home/vlofgren/Code/tmp-data/"); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java index 55015d13..da9206bf 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java @@ -11,12 +11,15 @@ import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.EdgeId; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import nu.marginalia.wmsa.edge.model.EdgeUrl; -import org.junit.jupiter.api.*; +import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; +import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import 
org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; import org.junit.jupiter.api.parallel.Execution; import org.junit.jupiter.api.parallel.ExecutionMode; import org.junit.jupiter.api.parallel.ResourceAccessMode; @@ -141,7 +144,7 @@ public class EdgeIndexClientTest { void putWords(int didx, int idx, double quality, String... words) { EdgePageWords epw = new EdgePageWords(IndexBlock.Title); epw.addAll(Arrays.asList(words)); - client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx), quality, + client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx), new EdgePageWordSet(epw), 0).blockingSubscribe(); } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java index c900f0f6..f0e6ecc0 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java @@ -61,7 +61,7 @@ class SearchIndexJournalWriterTest { void put() throws IOException { writer.put(new SearchIndexJournalEntryHeader(4, (1234L << 32) | 5678, IndexBlock.Link), new SearchIndexJournalEntry(new long[] { 1, 2, 3, 4 })); - writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words), + writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words_1), new SearchIndexJournalEntry(new long[] { 5, 6, 7 })); writer.forceWrite(); diff --git a/protocol/build.gradle b/protocol/build.gradle new file mode 100644 index 00000000..210a7612 --- /dev/null +++ b/protocol/build.gradle @@ -0,0 +1,27 @@ +plugins { + id "com.google.protobuf" version "0.8.19" + id "java" +} +repositories { + gradlePluginPortal() +} +protobuf { + protoc { + artifact = 'com.google.protobuf:protoc:3.0.0' + } +} + +sourceSets { + main { + java { + srcDirs 
'build/generated/source/proto/main/grpc' + srcDirs 'build/generated/source/proto/main/java' + } + } +} + +dependencies { + protobuf files ("def/") + + implementation group: 'com.google.protobuf', name: 'protobuf-java', version: '3.0.0' +} \ No newline at end of file diff --git a/protocol/def/index.proto b/protocol/def/index.proto new file mode 100644 index 00000000..30cf916d --- /dev/null +++ b/protocol/def/index.proto @@ -0,0 +1,21 @@ +syntax = "proto3"; + +option java_package = "nu.wmsa.wmsa.edge.index.proto"; +option java_outer_classname = "IndexProto"; +option java_multiple_files = true; + +message IndexPutKeywordsReq { + int32 domain = 1; + int32 url = 2; + int32 index = 3; + repeated WordSet wordSet = 4; + + message WordSet { + int32 index = 1; + repeated string words = 2; + } +} + +message IndexSearchQueryRsp { + +} \ No newline at end of file diff --git a/settings.gradle b/settings.gradle index cb3868c8..149ff1ea 100644 --- a/settings.gradle +++ b/settings.gradle @@ -1,4 +1,5 @@ rootProject.name = 'wmsa' include 'marginalia_nu' -include 'third_party' \ No newline at end of file +include 'third_party' +include 'protocol' \ No newline at end of file From f4ad7aaf3302691d3feb295ef09978e7fe9e0ac2 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Fri, 26 Aug 2022 20:48:44 +0200 Subject: [PATCH 03/19] Remove accidental import of an unused library, fix build on jdk18-systems. 
--- marginalia_nu/build.gradle | 2 -- .../edge/model/crawl/EdgePageWordSet.java | 17 +---------------- .../wmsa/edge/model/crawl/EdgePageWords.java | 19 +------------------ protocol/build.gradle | 6 ++++++ 4 files changed, 8 insertions(+), 36 deletions(-) diff --git a/marginalia_nu/build.gradle b/marginalia_nu/build.gradle index 8a8f3e0a..1be5b722 100644 --- a/marginalia_nu/build.gradle +++ b/marginalia_nu/build.gradle @@ -158,8 +158,6 @@ dependencies { jmh 'org.openjdk.jmh:jmh-core:1.35' jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35' - implementation 'com.dslplatform:dsl-json:1.9.9' - annotationProcessor 'com.dslplatform:dsl-json-processor:1.9.9' } configurations { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java index 6fdaf059..10ee3542 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java @@ -1,14 +1,12 @@ package nu.marginalia.wmsa.edge.model.crawl; -import com.dslplatform.json.JsonObject; -import com.dslplatform.json.JsonWriter; import lombok.Data; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import java.util.*; @Data -public class EdgePageWordSet implements JsonObject { +public class EdgePageWordSet { public Map wordSets; public EdgePageWordSet(EdgePageWords... 
words) { @@ -48,17 +46,4 @@ public class EdgePageWordSet implements JsonObject { return sj.toString(); } - @Override - public void serialize(JsonWriter writer, boolean minimal) { - writer.writeAscii("["); - boolean first = false; - for (var w : wordSets.values()) { - if (!first) first = true; - else writer.writeAscii(", "); - - w.serialize(writer, minimal); - } - writer.writeAscii("]}"); - - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java index 4a158c25..bc97d6aa 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java @@ -1,7 +1,4 @@ package nu.marginalia.wmsa.edge.model.crawl; -import com.dslplatform.json.JsonObject; -import com.dslplatform.json.JsonWriter; -import com.dslplatform.json.NumberConverter; import lombok.Getter; import lombok.ToString; import nu.marginalia.wmsa.edge.index.model.IndexBlock; @@ -11,7 +8,7 @@ import java.util.Collection; import java.util.List; @ToString @Getter -public class EdgePageWords implements JsonObject { +public class EdgePageWords{ public final IndexBlock block; public final List words = new ArrayList<>(); @@ -35,18 +32,4 @@ public class EdgePageWords implements JsonObject { } public void addJust(String word) { words.add(word); } - @Override - public void serialize(JsonWriter writer, boolean minimal) { - writer.writeAscii("{\"b\":"); - NumberConverter.serialize(block.ordinal(), writer); - writer.writeAscii(", \"w\": ["); - boolean first = false; - for (var word : words) { - if (!first) first = true; - else { writer.writeAscii(","); } - - writer.writeString(word); - } - writer.writeAscii("]}"); - } } diff --git a/protocol/build.gradle b/protocol/build.gradle index 210a7612..b03f8d7e 100644 --- a/protocol/build.gradle +++ b/protocol/build.gradle @@ -5,6 +5,12 @@ plugins 
{ repositories { gradlePluginPortal() } + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} protobuf { protoc { artifact = 'com.google.protobuf:protoc:3.0.0' From c865d6c6b2088af03334221d70098dcb860c6cad Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sat, 27 Aug 2022 11:38:29 +0200 Subject: [PATCH 04/19] Change TF-IDF normalization to reduce the amount of not-so-relevant matches. --- .../util/language/DocumentDebugger.java | 2 +- .../processing/DocumentKeywordExtractor.java | 26 ++++------ .../language/processing/KeywordCounter.java | 47 ++++++++++++------- .../wmsa/edge/index/model/IndexBlock.java | 22 +++++---- .../edge/index/reader/SearchIndexReader.java | 16 +++---- .../wmsa/edge/search/EdgeSearchProfile.java | 14 +++--- 6 files changed, 68 insertions(+), 59 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java index d4c0232e..385e78fb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java @@ -69,7 +69,7 @@ public class DocumentDebugger { Set reps = new HashSet<>(); // kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed)); - kc.count(languageData).forEach(rep -> reps.add(rep.stemmed)); +// kc.count(languageData).forEach(rep -> reps.add(rep.stemmed)); try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java index 479dcd4c..5511a9be 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java @@ -40,35 +40,27 
@@ public class DocumentKeywordExtractor { List titleWords = extractTitleWords(documentLanguageData); - List wordsTfIdf = tfIdfCounter.count(documentLanguageData); + KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData); List wordsNamesRepeated = nameCounter.count(documentLanguageData, 2); List wordsNamesAll = nameCounter.count(documentLanguageData, 1); List subjects = subjectCounter.count(documentLanguageData); - int totalSize = wordsTfIdf.size(); + List lowKeywords = new ArrayList<>(wordsTfIdf.lower()); + List midKeywords = new ArrayList<>(wordsTfIdf.mid()); + List topKeywords = new ArrayList<>(wordsTfIdf.top()); - List lowKeywords = new ArrayList<>(totalSize / 2); - List midKeywords = new ArrayList<>(totalSize / 2); - List topKeywords = new ArrayList<>(totalSize / 2); - - for(var v : wordsTfIdf) { - if (topKeywords.size() <= totalSize / 10) topKeywords.add(v); - else if (midKeywords.size() <= totalSize / 5) midKeywords.add(v); - else lowKeywords.add(v); - } - - var wordsToMatchWithTitle = joinWordLists(topKeywords, midKeywords, wordsNamesRepeated, subjects); + var wordsToMatchWithTitle = joinWordLists(topKeywords, wordsNamesRepeated, subjects); Collection artifacts = getArtifacts(documentLanguageData); var wordSet = new EdgePageWordSet( createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)), - createWords(IndexBlock.Topic, subjects), + createWords(IndexBlock.Subjects, subjects), createWords(IndexBlock.Title, titleWords), createWords(IndexBlock.NamesWords, wordsNamesAll), - createWords(IndexBlock.Top, topKeywords), - createWords(IndexBlock.Middle, midKeywords), - createWords(IndexBlock.Low, lowKeywords), + createWords(IndexBlock.Tfidf_Top, topKeywords), + createWords(IndexBlock.Tfidf_Middle, midKeywords), + createWords(IndexBlock.Tfidf_Lower, lowKeywords), new EdgePageWords(IndexBlock.Artifacts, artifacts) ); diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java index 49cee9bf..a0406abe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java @@ -4,9 +4,11 @@ import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; -import java.util.*; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; import java.util.regex.Pattern; -import java.util.stream.Collectors; public class KeywordCounter { private final KeywordExtractor keywordExtractor; @@ -17,10 +19,11 @@ public class KeywordCounter { this.keywordExtractor = keywordExtractor; } - public List count(DocumentLanguageData dld) { + public WordHistogram countHisto(DocumentLanguageData dld) { HashMap counts = new HashMap<>(1000); HashMap> instances = new HashMap<>(1000); + for (var sent : dld.sentences) { var keywords = keywordExtractor.getKeywordsFromSentence(sent); for (var span : keywords) { @@ -32,34 +35,44 @@ public class KeywordCounter { } } - return counts.entrySet().stream() - .filter(e -> e.getValue() > 1) - .sorted(Comparator.comparing(this::getTermValue)) - .map(Map.Entry::getKey) - .flatMap(w -> instances.get(w).stream()) - .filter(w -> w.word.length() > 1) - .limit(150) - .collect(Collectors.toList()); + double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1); + + Set h5 = new HashSet<>(); + Set h10 = new HashSet<>(); + Set h15 = new HashSet<>(); + + for (var entry : counts.entrySet()) { + double value = getTermValue(entry, maxC); + Set histogram; + if (value < -3) histogram = h15; + else if (value < -2) histogram = h10; + else if (value < -1) histogram = 
h5; + else continue; + + histogram.addAll(instances.get(entry.getKey())); + } + + return new WordHistogram(h5, h10, h15); } private static final Pattern separator = Pattern.compile("_"); - public double getTermValue(Map.Entry e) { + public double getTermValue(Map.Entry e, double maxValue) { String[] parts = separator.split(e.getKey()); double totalValue = 0.; for (String part : parts) { - totalValue += value(part, e.getValue()); + totalValue += value(part, e.getValue(), maxValue); } - return totalValue / Math.sqrt(parts.length); + return totalValue / parts.length; } - double value(String key, double value) { + double value(String key, double value, double maxValue) { double freq = dict.getTermFreqStemmed(key); if (freq < 1) { freq = 10; } - return (1+Math.log(value)) * Math.log((1.1+freq)/11820118.); + return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/11820118.); } - + public record WordHistogram(Set lower, Set mid, Set top) { } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java index f35fcf3a..f906265a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java @@ -3,16 +3,20 @@ package nu.marginalia.wmsa.edge.index.model; public enum IndexBlock { TitleKeywords(0, 0), Title(1, 1), + Link(2, 1.25), - Top(3, 2), - Middle(4, 2.5), - Low(5, 3.0), - Words_1(6, 3.0), - Meta(7, 7), - Words_2(8, 3.5), - NamesWords(9, 5), - Artifacts(10, 10), - Topic(11, 0.5), + + Subjects(3, 0.5), + NamesWords(4, 5), + Artifacts(5, 10), + Meta(6, 7), + + Tfidf_Top(7, 2), + Tfidf_Middle(8, 2.5), + Tfidf_Lower(9, 5.0), + + Words_1(10, 3.0), + Words_2(11, 3.5), Words_4(12, 4.0), Words_8(13, 4.5), Words_16Plus(14, 7.0), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java index cc2927ca..3816ec39 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java @@ -26,9 +26,9 @@ public class SearchIndexReader implements AutoCloseable { private final Logger logger = LoggerFactory.getLogger(getClass()); private static final IndexBlock[] indicesBySearchOrder = new IndexBlock[] { - IndexBlock.Top, - IndexBlock.Middle, - IndexBlock.Low, + IndexBlock.Tfidf_Top, + IndexBlock.Tfidf_Middle, + IndexBlock.Tfidf_Lower, IndexBlock.NamesWords, IndexBlock.Words_1, IndexBlock.Words_2, @@ -42,15 +42,15 @@ public class SearchIndexReader implements AutoCloseable { EnumMap indices) { this.indices = indices; - var lowIndex = indices.get(IndexBlock.Low); - var midIndex = indices.get(IndexBlock.Middle); - var topIndex = indices.get(IndexBlock.Top); + var lowIndex = indices.get(IndexBlock.Tfidf_Lower); + var midIndex = indices.get(IndexBlock.Tfidf_Middle); + var topIndex = indices.get(IndexBlock.Tfidf_Top); var linkIndex = indices.get(IndexBlock.Link); var titleIndex = indices.get(IndexBlock.Title); var namesIndex = indices.get(IndexBlock.NamesWords); var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords); var metaIndex = indices.get(IndexBlock.Meta); - var topicIndex = indices.get(IndexBlock.Topic); + var topicIndex = indices.get(IndexBlock.Subjects); var words1 = indices.get(IndexBlock.Words_1); var words2 = indices.get(IndexBlock.Words_2); @@ -70,7 +70,7 @@ public class SearchIndexReader implements AutoCloseable { underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1)); underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, 
linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1)); - underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1)); + underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1)); } @SafeVarargs diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java index f9deecd2..fb607e16 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java @@ -10,31 +10,31 @@ import java.util.stream.Collectors; public enum EdgeSearchProfile { DEFAULT("default", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Link, + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus ), 0, 1), MODERN("modern", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Link, IndexBlock.NamesWords, + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords, IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus ), 2), CORPO("corpo", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords, + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords, IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, 
IndexBlock.Words_16Plus), 4, 5, 7), YOLO("yolo", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords, + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords, IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus), 0, 2, 1, 3, 4, 6), CORPO_CLEAN("corpo-clean", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords), 4, 5), ACADEMIA("academia", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords), 3), FOOD("food", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords), 2, 0), ; From 028215697933340fe7a76a178c4465a7c7289929 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sat, 27 Aug 2022 19:19:16 +0200 Subject: [PATCH 05/19] WIP n-gram loader --- .../wmsa/edge/assistant/dict/NGramDict.java | 68 ++++++++++++++----- 1 file changed, 50 insertions(+), 18 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java index e48b2ec5..1c2f1c8a 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java @@ -2,7 +2,16 @@ package nu.marginalia.wmsa.edge.assistant.dict; import ca.rmen.porterstemmer.PorterStemmer; import gnu.trove.map.hash.TLongIntHashMap; +import nu.marginalia.util.language.LanguageFilter; import nu.marginalia.util.language.conf.LanguageModels; +import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.wmsa.configuration.WmsaHome; +import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner; +import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; +import opennlp.tools.langdetect.LanguageDetector; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -12,7 +21,7 @@ import javax.inject.Singleton; import java.io.*; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Arrays; +import java.util.*; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -55,30 +64,53 @@ public class NGramDict { } - public static void main(String... args) { + public static void main(String... 
args) throws IOException { if (args.length != 2) { - System.err.println("Expected arguments: in-file out-file"); + System.err.println("Expected arguments: plan.yaml out-file"); } String inFile = args[0]; String outFile = args[1]; - var wordPattern = Pattern.compile("\\w+(_\\w+)*").asMatchPredicate(); - try (var linesStr = Files.lines(Path.of(inFile)); - var dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(outFile))) - ) { - linesStr - .filter(wordPattern) - .mapToLong(NGramDict::getStringHash).forEach(l -> - { - try { - dos.writeLong(l); - } catch (IOException e) { - e.printStackTrace(); + var plan = new CrawlPlanLoader().load(Path.of(args[0])); + + SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); + DomPruner pruner = new DomPruner(); + LanguageFilter lf = new LanguageFilter(); + + Map counts = new HashMap<>(100_000_000); + + for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine + + if (domain.doc == null) + continue; + + for (var doc : domain.doc) { + if (doc.documentBody == null) + continue; + + Document parsed = Jsoup.parse(doc.documentBody); + pruner.prune(parsed, 0.5); + + DocumentLanguageData dld = se.extractSentences(parsed); + + if (lf.dictionaryAgreement(dld) < 0.1) { + continue; } - }); - } catch (IOException e) { - e.printStackTrace(); + + for (var sent : dld.sentences) { + for (var word : sent) { + counts.merge(word.stemmed(), 1, Integer::sum); + } + } + } } + + counts.forEach((w,c) -> { + if (c > 3) { + System.out.println(w + ":" + c); + } + }); + } public static long getStringHash(String s) { From 3f2854a5e910b6d7ef809ecf0095bfe059e2f195 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sat, 27 Aug 2022 20:30:18 +0200 Subject: [PATCH 06/19] WIP n-gram loader --- .../marginalia/wmsa/edge/assistant/dict/NGramDict.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java index 1c2f1c8a..992ddbba 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java @@ -78,6 +78,7 @@ public class NGramDict { LanguageFilter lf = new LanguageFilter(); Map counts = new HashMap<>(100_000_000); + Set words = new HashSet<>(10_000); for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine @@ -97,11 +98,18 @@ public class NGramDict { continue; } + for (var sent : dld.sentences) { for (var word : sent) { - counts.merge(word.stemmed(), 1, Integer::sum); + words.add(word.stemmed()); } } + + for (var word : words) { + counts.merge(word, 1, Integer::sum); + } + + words.clear(); } } From 813399401e25c8d944689304a1fa3590b0c016f2 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 29 Aug 2022 18:01:07 +0200 Subject: [PATCH 07/19] Tweaks for search result relevance --- .../java/nu/marginalia/util/DenseBitMap.java | 29 +++ .../util/language/DocumentDebugger.java | 4 +- .../util/language/conf/LanguageModels.java | 5 +- .../processing/DocumentKeywordExtractor.java | 42 +++-- .../language/processing/KeywordCounter.java | 10 +- .../language/processing/KeywordExtractor.java | 4 +- .../language/processing/LongNameCounter.java | 9 +- .../wmsa/configuration/WmsaHome.java | 2 +- .../edge/assistant/dict/NGramBloomFilter.java | 78 ++++++++ ...{NGramDict.java => TermFrequencyDict.java} | 117 ++++++++---- .../edge/assistant/suggest/Suggestions.java | 10 +- .../converting/atags/AnchorTextExtractor.java | 4 +- .../model/DisqualifiedException.java | 3 +- .../processor/DocumentProcessor.java | 29 ++- .../converting/processor/DomainProcessor.java | 42 ++++- .../converting/processor/logic/DomPruner.java | 169 +++++++++--------- .../processor/logic/FeatureExtractor.java | 25 ++- .../processor/logic/HtmlFeature.java | 2 + 
.../processor/logic/topic/RecipeDetector.java | 2 + .../logic/topic/TextileCraftDetector.java | 2 + .../logic/topic/WoodworkingDetector.java | 2 + .../journal/KeywordLexiconJournalFile.java | 2 +- .../wmsa/edge/index/model/IndexBlock.java | 14 +- .../edge/index/reader/SearchIndexReader.java | 16 +- .../edge/model/crawl/EdgePageWordSet.java | 3 +- .../edge/search/query/EnglishDictionary.java | 6 +- .../wmsa/edge/search/query/QueryFactory.java | 11 +- .../wmsa/edge/search/query/QueryVariants.java | 29 +-- .../search/results/SearchResultValuator.java | 6 +- .../main/resources/templates/edge/index.hdb | 2 +- .../marginalia/util/TestLanguageModels.java | 4 +- .../assistant/suggest/SuggestionsTest.java | 6 +- .../edge/crawling/SentenceExtractorTest.java | 10 +- .../integration/arxiv/ArxivParserTest.java | 4 +- .../stackoverflow/StackOverflowPostsTest.java | 6 +- .../integration/wikipedia/WikipediaTest.java | 8 +- .../search/query/BodyQueryParserTest.java | 16 +- .../search/query/EnglishDictionaryTest.java | 4 +- .../edge/search/query/QueryParserTest.java | 14 +- .../edge/search/query/QueryVariantsTest.java | 12 +- 40 files changed, 509 insertions(+), 254 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramBloomFilter.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/{NGramDict.java => TermFrequencyDict.java} (55%) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java index 39b34048..88d2fa18 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java @@ -1,6 +1,10 @@ package nu.marginalia.util; +import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; public class DenseBitMap { public static final long 
MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8; @@ -15,6 +19,31 @@ public class DenseBitMap { this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0))); } + public static DenseBitMap loadFromFile(Path file) throws IOException { + long size = Files.size(file); + var dbm = new DenseBitMap(size/8); + + try (var bc = Files.newByteChannel(file)) { + while (dbm.buffer.position() < dbm.buffer.capacity()) { + bc.read(dbm.buffer); + } + } + dbm.buffer.clear(); + + return dbm; + } + + public void writeToFile(Path file) throws IOException { + + try (var bc = Files.newByteChannel(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) { + while (buffer.position() < buffer.capacity()) { + bc.write(buffer); + } + } + + buffer.clear(); + } + public boolean get(long pos) { return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java index 385e78fb..fb081c95 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java @@ -8,7 +8,7 @@ import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.model.DocumentSentence; import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.util.language.processing.model.tag.WordSeparator; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import org.jsoup.nodes.Document; import java.io.FileNotFoundException; @@ -30,7 +30,7 @@ public class DocumentDebugger { Path tempDir; public DocumentDebugger(LanguageModels lm) throws IOException { se = new SentenceExtractor(lm); - var dict = new NGramDict(lm); + var dict = new TermFrequencyDict(lm); ke = new KeywordExtractor(); kc = new 
KeywordCounter(dict, ke); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/conf/LanguageModels.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/conf/LanguageModels.java index c306b922..65cb21cb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/conf/LanguageModels.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/conf/LanguageModels.java @@ -6,8 +6,9 @@ import java.nio.file.Path; @AllArgsConstructor public class LanguageModels { - public final Path ngramDictionary; - public final Path ngramFrequency; + public final Path ngramBloomFilter; + public final Path termFrequencies; + public final Path openNLPSentenceDetectionData; public final Path posRules; public final Path posDict; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java index 5511a9be..0eed6a2e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java @@ -4,7 +4,7 @@ import com.google.common.collect.Sets; import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.util.language.processing.model.WordRep; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; @@ -19,29 +19,54 @@ public class DocumentKeywordExtractor { private final KeywordExtractor keywordExtractor; private final KeywordCounter tfIdfCounter; private final NameCounter nameCounter; - private final LongNameCounter longNameCounter; private final 
SubjectCounter subjectCounter; - private final NGramDict dict; + private final TermFrequencyDict dict; + private final double docCount; @Inject - public DocumentKeywordExtractor(NGramDict dict) { + public DocumentKeywordExtractor(TermFrequencyDict dict) { this.dict = dict; + docCount = dict.docCount(); keywordExtractor = new KeywordExtractor(); tfIdfCounter = new KeywordCounter(dict, keywordExtractor); nameCounter = new NameCounter(keywordExtractor); - longNameCounter = new LongNameCounter(dict, keywordExtractor); subjectCounter = new SubjectCounter(keywordExtractor); } + + public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData) { + + List titleWords = extractTitleWords(documentLanguageData); + + KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData); + List wordsNamesAll = nameCounter.count(documentLanguageData, 1); + List subjects = subjectCounter.count(documentLanguageData); + + List midKeywords = new ArrayList<>(wordsTfIdf.mid()); + List topKeywords = new ArrayList<>(wordsTfIdf.top()); + + Collection artifacts = getArtifacts(documentLanguageData); + + return new EdgePageWordSet( + createWords(IndexBlock.Subjects, subjects), + createWords(IndexBlock.Title, titleWords), + createWords(IndexBlock.NamesWords, wordsNamesAll), + createWords(IndexBlock.Tfidf_Top, topKeywords), + createWords(IndexBlock.Tfidf_Middle, midKeywords), + new EdgePageWords(IndexBlock.Artifacts, artifacts) + ); + } + + + public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) { List titleWords = extractTitleWords(documentLanguageData); KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData); - List wordsNamesRepeated = nameCounter.count(documentLanguageData, 2); List wordsNamesAll = nameCounter.count(documentLanguageData, 1); List subjects = subjectCounter.count(documentLanguageData); @@ -49,12 +74,9 @@ public class DocumentKeywordExtractor { List midKeywords = new 
ArrayList<>(wordsTfIdf.mid()); List topKeywords = new ArrayList<>(wordsTfIdf.top()); - var wordsToMatchWithTitle = joinWordLists(topKeywords, wordsNamesRepeated, subjects); - Collection artifacts = getArtifacts(documentLanguageData); var wordSet = new EdgePageWordSet( - createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)), createWords(IndexBlock.Subjects, subjects), createWords(IndexBlock.Title, titleWords), createWords(IndexBlock.NamesWords, wordsNamesAll), @@ -121,7 +143,7 @@ public class DocumentKeywordExtractor { else { lastSet = counts.entrySet().stream() .sorted(Comparator.comparing(e -> { - double N = 11820118.; // Number of documents in term freq dictionary + double N = docCount; // Number of documents in term freq dictionary // Caveat: This is actually the *negated* term score, because the second logarithm has // its parameter inverted (log(a^b) = b log(a); here b = -1) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java index a0406abe..4217b743 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java @@ -2,7 +2,7 @@ package nu.marginalia.util.language.processing; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.util.language.processing.model.WordRep; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import java.util.HashMap; import java.util.HashSet; @@ -12,11 +12,13 @@ import java.util.regex.Pattern; public class KeywordCounter { private final KeywordExtractor keywordExtractor; - private final NGramDict dict; + private final TermFrequencyDict dict; + private final double docCount; - public KeywordCounter(NGramDict dict, 
KeywordExtractor keywordExtractor) { + public KeywordCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) { this.dict = dict; this.keywordExtractor = keywordExtractor; + this.docCount = (double) dict.docCount(); } public WordHistogram countHisto(DocumentLanguageData dld) { @@ -71,7 +73,7 @@ public class KeywordCounter { if (freq < 1) { freq = 10; } - return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/11820118.); + return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/docCount); } public record WordHistogram(Set lower, Set mid, Set top) { } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java index 0dba8e53..ea68c63e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java @@ -1,9 +1,9 @@ package nu.marginalia.util.language.processing; +import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.processing.model.DocumentSentence; import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.util.language.processing.model.tag.WordSeparator; -import nu.marginalia.util.language.WordPatterns; import java.lang.ref.SoftReference; import java.util.ArrayList; @@ -377,4 +377,6 @@ public class KeywordExtractor { return (posTag.startsWith("JJ") || posTag.startsWith("R") || posTag.startsWith("VBG")); } + + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java index 3943e046..e2dfd3ad 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java @@ -3,7 +3,7 @@ package 
nu.marginalia.util.language.processing; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.util.language.processing.model.DocumentSentence; import nu.marginalia.util.language.processing.model.WordRep; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import java.util.*; import java.util.regex.Pattern; @@ -11,10 +11,11 @@ import java.util.stream.Collectors; public class LongNameCounter { private final KeywordExtractor keywordExtractor; - - private final NGramDict dict; - public LongNameCounter(NGramDict dict, KeywordExtractor keywordExtractor) { + private final TermFrequencyDict dict; + private final double docCount; + public LongNameCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) { this.dict = dict; + docCount = (double) dict.docCount(); this.keywordExtractor = keywordExtractor; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java index f82b9527..877ca129 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java @@ -87,7 +87,7 @@ public class WmsaHome { final Path home = getHomePath(); return new LanguageModels( - home.resolve("model/ngrams-generous-emstr.bin"), + home.resolve("model/ngrams.bin"), home.resolve("model/tfreq-new-algo3.bin"), home.resolve("model/opennlp-sentence.bin"), home.resolve("model/English.RDR"), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramBloomFilter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramBloomFilter.java new file mode 100644 index 00000000..2eef7d39 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramBloomFilter.java @@ -0,0 +1,78 @@ +package 
nu.marginalia.wmsa.edge.assistant.dict; + +import ca.rmen.porterstemmer.PorterStemmer; +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import com.google.inject.Inject; +import nu.marginalia.util.DenseBitMap; +import nu.marginalia.util.language.conf.LanguageModels; +import nu.marginalia.wmsa.configuration.WmsaHome; +import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournalFile; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.regex.Pattern; + +public class NGramBloomFilter { + private final DenseBitMap bitMap; + private static final PorterStemmer ps = new PorterStemmer(); + private static final HashFunction hasher = Hashing.murmur3_128(0); + + @Inject + public NGramBloomFilter() throws IOException { + this(WmsaHome.getLanguageModels()); + } + + public NGramBloomFilter(LanguageModels lm) throws IOException { + this(DenseBitMap.loadFromFile(lm.ngramBloomFilter)); + } + + public NGramBloomFilter(DenseBitMap bitMap) { + this.bitMap = bitMap; + } + + public boolean isKnownNGram(String word) { + long bit = bitForWord(word, bitMap.cardinality); + + return bitMap.get(bit); + } + + public static void main(String... 
args) throws IOException { + var filter = convertFromDictionaryFile(new File(args[0])); + filter.bitMap.writeToFile(Path.of(args[1])); + } + + public static NGramBloomFilter load(Path file) throws IOException { + return new NGramBloomFilter(DenseBitMap.loadFromFile(file)); + } + + public static NGramBloomFilter convertFromDictionaryFile(File file) throws IOException { + DenseBitMap bitMap = new DenseBitMap(1024*1024*1024L); + AtomicInteger popCount = new AtomicInteger(); + try (var f = new KeywordLexiconJournalFile(file)) { + f.loadFile(data -> { + long bit = bitForWord(new String(data), bitMap.cardinality); + if (!bitMap.set(bit)) + popCount.incrementAndGet(); + }); + } + + System.out.println("popcount = " + popCount.get()); + return new NGramBloomFilter(bitMap); + } + + private static final Pattern underscore = Pattern.compile("_"); + + private static long bitForWord(String s, long n) { + String[] parts = underscore.split(s); + long hc = 0; + for (String part : parts) { + hc = hc * 31 + hasher.hashString(ps.stemWord(part), StandardCharsets.UTF_8).padToLong(); + } + return (hc & 0x7FFF_FFFF_FFFF_FFFFL) % n; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java similarity index 55% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java index 992ddbba..d219b30d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java @@ -9,7 +9,6 @@ import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner; import 
nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; -import opennlp.tools.langdetect.LanguageDetector; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.slf4j.Logger; @@ -21,12 +20,17 @@ import javax.inject.Singleton; import java.io.*; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; import java.util.stream.Collectors; @Singleton -public class NGramDict { +public class TermFrequencyDict { private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0); @@ -34,21 +38,22 @@ public class NGramDict { private static final Pattern separator = Pattern.compile("[_ ]+"); private static final PorterStemmer ps = new PorterStemmer(); + private static final long DOC_COUNT_KEY = ~0L; private static long fileSize(Path p) throws IOException { return Files.size(p); } @Inject - public NGramDict(@Nullable LanguageModels models) { + public TermFrequencyDict(@Nullable LanguageModels models) { if (models == null) { return; } - if (models.ngramFrequency != null) { + if (models.termFrequencies != null) { - try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.ngramFrequency.toFile())))) { + try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.termFrequencies.toFile())))) { - wordRates.ensureCapacity((int)(fileSize(models.ngramFrequency)/16)); + wordRates.ensureCapacity((int)(fileSize(models.termFrequencies)/16)); for (;;) { wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong()); @@ -56,7 +61,7 @@ public class NGramDict { } catch (EOFException eof) { // ok } catch (IOException e) { - logger.error("IO Exception reading " + models.ngramFrequency, e); + logger.error("IO Exception reading " + 
models.termFrequencies, e); } } @@ -64,60 +69,100 @@ public class NGramDict { } - public static void main(String... args) throws IOException { + public int docCount() { + int cnt = wordRates.get(DOC_COUNT_KEY); + + if (cnt == 0) { + cnt = 11820118; // legacy + } + return cnt; + } + + public static void main(String... args) throws IOException, InterruptedException { if (args.length != 2) { System.err.println("Expected arguments: plan.yaml out-file"); } - String inFile = args[0]; String outFile = args[1]; var plan = new CrawlPlanLoader().load(Path.of(args[0])); - SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); + ThreadLocal se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels())); DomPruner pruner = new DomPruner(); LanguageFilter lf = new LanguageFilter(); - Map counts = new HashMap<>(100_000_000); - Set words = new HashSet<>(10_000); + TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1); + + ForkJoinPool fjp = new ForkJoinPool(24); + AtomicInteger docCount = new AtomicInteger(); for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine if (domain.doc == null) continue; - for (var doc : domain.doc) { - if (doc.documentBody == null) - continue; + fjp.execute(() -> { - Document parsed = Jsoup.parse(doc.documentBody); - pruner.prune(parsed, 0.5); + for (var doc : domain.doc) { + if (doc.documentBody == null) + continue; + docCount.incrementAndGet(); - DocumentLanguageData dld = se.extractSentences(parsed); + Document parsed = Jsoup.parse(doc.documentBody); + pruner.prune(parsed, 0.5); - if (lf.dictionaryAgreement(dld) < 0.1) { - continue; - } + DocumentLanguageData dld = se.get().extractSentences(parsed); - - for (var sent : dld.sentences) { - for (var word : sent) { - words.add(word.stemmed()); + if (lf.dictionaryAgreement(dld) < 0.1) { + return; } - } - for (var word : words) { - counts.merge(word, 1, Integer::sum); - } + Set words = new HashSet<>(10_000); - 
words.clear(); + for (var sent : dld.sentences) { + for (var word : sent) { + words.add(word.stemmed()); + } + } + + fjp.execute(() -> { + synchronized (counts) { + for (var word : words) { + counts.adjustOrPutValue(longHash(word.getBytes()), 1, 1); + } + } + }); + + } + }); + } + + fjp.shutdown(); + fjp.awaitTermination(10, TimeUnit.SECONDS); + + try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) { + synchronized (counts) { + counts.put(DOC_COUNT_KEY, docCount.get()); + + counts.forEachEntry((hash, cnt) -> { + try { + dos.writeLong(hash); + dos.writeLong(cnt); + } catch (IOException e) { + throw new RuntimeException(e); + } + return true; + }); } } - counts.forEach((w,c) -> { - if (c > 3) { - System.out.println(w + ":" + c); - } - }); + System.out.println(docCount.get()); +// +// counts.forEachEntry((w,c) -> { +// if (c > 3L) { +// System.out.println(w + ":" + c); +// } +// return true; +// }); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java index b8284420..ff793015 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java @@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.assistant.suggest; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; import org.apache.commons.collections4.trie.PatriciaTrie; import org.slf4j.Logger; @@ -21,7 +21,7 @@ import java.util.stream.Stream; public class Suggestions { private final PatriciaTrie suggestionsTrie; - private final NGramDict nGramDict; + private final 
TermFrequencyDict termFrequencyDict; private final SpellChecker spellChecker; private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$"); @@ -31,12 +31,12 @@ public class Suggestions { @Inject public Suggestions(@Named("suggestions-file") Path suggestionsFile, SpellChecker spellChecker, - NGramDict dict + TermFrequencyDict dict ) { this.spellChecker = spellChecker; suggestionsTrie = loadSuggestions(suggestionsFile); - nGramDict = dict; + termFrequencyDict = dict; logger.info("Loaded {} suggestions", suggestionsTrie.size()); } @@ -138,7 +138,7 @@ public class Suggestions { } Map scach = new HashMap<>(512); - Function valr = s -> -nGramDict.getTermFreqHash(scach.computeIfAbsent(s, NGramDict::getStringHash)); + Function valr = s -> -termFrequencyDict.getTermFreqHash(scach.computeIfAbsent(s, TermFrequencyDict::getStringHash)); return Stream.iterate(start.getKey(), Objects::nonNull, suggestionsTrie::nextKey) .takeWhile(s -> s.startsWith(prefix)) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java index 6d4927fb..744281a5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java @@ -6,7 +6,7 @@ import lombok.SneakyThrows; import nu.marginalia.util.DenseBitMap; import nu.marginalia.util.language.WordPatterns; import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.apache.logging.log4j.util.Strings; @@ -36,7 +36,7 @@ public class AnchorTextExtractor { // de-duplicating billions of shuffled (url, 
word) tuples on limited hardware private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS); - private final NGramDict ngramDict = new NGramDict(WmsaHome.getLanguageModels()); + private final TermFrequencyDict ngramDict = new TermFrequencyDict(WmsaHome.getLanguageModels()); public AnchorTextExtractor(Predicate includeDomainPredicate, Predicate includeUrlPredicate, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java index c252f315..4f30e7da 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java @@ -17,6 +17,7 @@ public class DisqualifiedException extends Exception { LANGUAGE, STATUS, QUALITY, - ACCEPTABLE_ADS + ACCEPTABLE_ADS, + FORBIDDEN } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index ef88c831..70a58dc2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -86,10 +86,6 @@ public class DocumentProcessor { if (isAcceptedContentType(crawledDocument)) { var detailsWords = createDetails(crawledDomain, crawledDocument); - if (detailsWords.details().quality < minDocumentQuality) { - throw new DisqualifiedException(DisqualificationReason.QUALITY); - } - ret.details = detailsWords.details(); ret.words = detailsWords.words(); } @@ -141,11 +137,14 @@ public class DocumentProcessor { private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) throws 
DisqualifiedException, URISyntaxException { - var doc = Jsoup.parse(crawledDocument.documentBody); + Document doc = Jsoup.parse(crawledDocument.documentBody); if (AcceptableAds.hasAcceptableAdsTag(doc)) { throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS); } + if (doc.select("meta[name=robots]").attr("content").contains("noindex")) { + throw new DisqualifiedException(DisqualificationReason.FORBIDDEN); + } DomPruner domPruner = new DomPruner(); Document prunedDoc = doc.clone(); @@ -160,11 +159,17 @@ public class DocumentProcessor { ret.length = getLength(doc); ret.standard = getHtmlStandard(doc); ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url); - ret.features = featureExtractor.getFeatures(crawledDomain, doc); + ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld); ret.quality = documentValuator.getQuality(ret.standard, doc, dld); ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong(); - var words = getWords(dld); + EdgePageWordSet words; + if (ret.quality < minDocumentQuality || dld.totalNumWords() < minDocumentLength) { + words = keywordExtractor.extractKeywordsMinimal(dld); + } + else { + words = keywordExtractor.extractKeywords(dld); + } var url = new EdgeUrl(crawledDocument.url); addMetaWords(ret, url, crawledDomain, words); @@ -195,7 +200,6 @@ public class DocumentProcessor { ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add); words.append(IndexBlock.Meta, tagWords); - words.append(IndexBlock.Words_1, tagWords); } private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) { @@ -255,7 +259,6 @@ public class DocumentProcessor { private void synthesizeFilenameKeyword(Set fileKeywords, EdgeUrl link) { - Path pFilename = Path.of(link.path.toLowerCase()).getFileName(); if (pFilename == null) return; @@ -273,10 +276,6 @@ public class DocumentProcessor { } private void checkDocumentLanguage(DocumentLanguageData 
dld) throws DisqualifiedException { - if (dld.totalNumWords() < minDocumentLength) { - throw new DisqualifiedException(DisqualificationReason.LENGTH); - } - double languageAgreement = languageFilter.dictionaryAgreement(dld); if (languageAgreement < 0.1) { throw new DisqualifiedException(DisqualificationReason.LANGUAGE); @@ -292,10 +291,6 @@ public class DocumentProcessor { return htmlStandard; } - private EdgePageWordSet getWords(DocumentLanguageData dld) { - return keywordExtractor.extractKeywords(dld); - } - private String getDescription(Document doc) { return summaryExtractor.extractSummary(doc); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java index b8b53f9d..549db2c9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java @@ -6,13 +6,13 @@ import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; +import java.util.*; +import java.util.stream.Collectors; public class DomainProcessor { private final DocumentProcessor documentProcessor; @@ -45,6 +45,8 @@ public class DomainProcessor { ret.documents.add(processedDoc); } } + + addCommonSiteWords(ret); } else { ret.documents = Collections.emptyList(); @@ -60,6 +62,40 @@ public class DomainProcessor { return 
ret; } + private void addCommonSiteWords(ProcessedDomain ret) { + + if (ret.documents.size() < 25) + return; + + Map topKeywordCount = new HashMap<>(ret.documents.size()*10); + + for (var doc : ret.documents) { + if (doc.words == null) + continue; + + for (var word : doc.words.get(IndexBlock.Tfidf_Top).words) { + topKeywordCount.merge(word, -1, Integer::sum); + } + } + + if (topKeywordCount.values().stream().mapToInt(i -> i).sum() > -100) + return; + + Set topWords = topKeywordCount.entrySet().stream() + .filter(e -> e.getValue() < -10) + .sorted(Map.Entry.comparingByValue()).limit(5) + .map(Map.Entry::getKey) + .collect(Collectors.toSet()); + + if (!topWords.isEmpty()) { + for (var doc : ret.documents) { + if (doc.words != null) { + doc.words.get(IndexBlock.Site).addAll(topWords); + } + } + } + } + private double getAverageQuality(List documents) { int n = 0; double q = 0.; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java index ebe3de66..beb23977 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java @@ -4,7 +4,7 @@ import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; -import org.jsoup.select.NodeVisitor; +import org.jsoup.select.NodeFilter; import java.util.HashMap; import java.util.Map; @@ -12,100 +12,103 @@ import java.util.Map; public class DomPruner { public void prune(Document document, double pruneThreshold) { - PruningVisitor pruningVisitor = new PruningVisitor(); - document.traverse(pruningVisitor); - - pruningVisitor.data.forEach((node, data) -> { - if (data.depth <= 1) { - return; - } - if (data.signalNodeSize == 0) node.remove(); - else if (data.noiseNodeSize > 0 - && 
data.signalRate() < pruneThreshold - && data.treeSize > 3) { - node.remove(); - } - }); + document.filter(new PruningFilter(pruneThreshold)); } +} - private static class PruningVisitor implements NodeVisitor { +class PruningFilter implements NodeFilter { - private final Map data = new HashMap<>(); - private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0); + private final Map data = new HashMap<>(); + private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0); + private double pruneThreshold; - @Override - public void head(Node node, int depth) {} - - @Override - public void tail(Node node, int depth) { - final NodeData dataForNode; - - if (node instanceof TextNode tn) { - dataForNode = new NodeData(depth, tn.text().length(), 0); - } - else if (isSignal(node)) { - dataForNode = new NodeData(depth, 0,0); - for (var childNode : node.childNodes()) { - dataForNode.add(data.getOrDefault(childNode, dummy)); - } - } - else { - dataForNode = new NodeData(depth, 0,0); - for (var childNode : node.childNodes()) { - dataForNode.addAsNoise(data.getOrDefault(childNode, dummy)); - } - } - - - - data.put(node, dataForNode); - } - - public boolean isSignal(Node node) { - - if (node instanceof Element e) { - if ("a".equalsIgnoreCase(e.tagName())) - return false; - if ("nav".equalsIgnoreCase(e.tagName())) - return false; - if ("footer".equalsIgnoreCase(e.tagName())) - return false; - if ("header".equalsIgnoreCase(e.tagName())) - return false; - } - - return true; - } + public PruningFilter(double pruneThreshold) { + this.pruneThreshold = pruneThreshold; } - private static class NodeData { - int signalNodeSize; - int noiseNodeSize; - int treeSize = 1; - int depth; + @Override + public FilterResult head(Node node, int depth) { + return FilterResult.CONTINUE; + } - private NodeData(int depth, int signalNodeSize, int noiseNodeSize) { - this.depth = depth; - this.signalNodeSize = signalNodeSize; - this.noiseNodeSize = noiseNodeSize; + @Override + public FilterResult 
tail(Node node, int depth) { + final NodeData dataForNode; + + if (node instanceof TextNode tn) { + dataForNode = new NodeData(depth, tn.text().length(), 0); + } + else if (isSignal(node)) { + dataForNode = new NodeData(depth, 0,0); + for (var childNode : node.childNodes()) { + dataForNode.add(data.getOrDefault(childNode, dummy)); + } + } + else { + dataForNode = new NodeData(depth, 0,0); + for (var childNode : node.childNodes()) { + dataForNode.addAsNoise(data.getOrDefault(childNode, dummy)); + } } - public void add(NodeData other) { - signalNodeSize += other.signalNodeSize; - noiseNodeSize += other.noiseNodeSize; - treeSize += other.treeSize; + data.put(node, dataForNode); + + if (dataForNode.depth <= 1) + return FilterResult.CONTINUE; + + if (dataForNode.signalNodeSize == 0) + return FilterResult.REMOVE; + if (dataForNode.noiseNodeSize > 0 + && dataForNode.signalRate() < pruneThreshold + && dataForNode.treeSize > 3) + return FilterResult.REMOVE; + + return FilterResult.CONTINUE; + } + + public boolean isSignal(Node node) { + + if (node instanceof Element e) { + if ("a".equalsIgnoreCase(e.tagName())) + return false; + if ("nav".equalsIgnoreCase(e.tagName())) + return false; + if ("footer".equalsIgnoreCase(e.tagName())) + return false; + if ("header".equalsIgnoreCase(e.tagName())) + return false; } - public void addAsNoise(NodeData other) { - noiseNodeSize += other.noiseNodeSize + other.signalNodeSize; - treeSize += other.treeSize; - } - - - public double signalRate() { - return signalNodeSize / (double)(signalNodeSize + noiseNodeSize); - } + return true; } } + +class NodeData { + int signalNodeSize; + int noiseNodeSize; + int treeSize = 1; + int depth; + + NodeData(int depth, int signalNodeSize, int noiseNodeSize) { + this.depth = depth; + this.signalNodeSize = signalNodeSize; + this.noiseNodeSize = noiseNodeSize; + } + + public void add(NodeData other) { + signalNodeSize += other.signalNodeSize; + noiseNodeSize += other.noiseNodeSize; + treeSize += 
other.treeSize; + } + + public void addAsNoise(NodeData other) { + noiseNodeSize += other.noiseNodeSize + other.signalNodeSize; + treeSize += other.treeSize; + } + + public double signalRate() { + return signalNodeSize / (double)(signalNodeSize + noiseNodeSize); + } +} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java index 9f20a65f..8e48f719 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java @@ -2,7 +2,11 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; import com.google.inject.Inject; import com.google.inject.Singleton; +import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -35,14 +39,20 @@ public class FeatureExtractor { "d31qbv1cthcecs.cloudfront.net", "linkedin.com"); - private AdblockSimulator adblockSimulator; + private final AdblockSimulator adblockSimulator; + private final RecipeDetector recipeDetector; + private final TextileCraftDetector textileCraftDetector; + private final WoodworkingDetector woodworkingDetector; @Inject - public FeatureExtractor(AdblockSimulator adblockSimulator) { + public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector 
textileCraftDetector, WoodworkingDetector woodworkingDetector) { this.adblockSimulator = adblockSimulator; + this.recipeDetector = recipeDetector; + this.textileCraftDetector = textileCraftDetector; + this.woodworkingDetector = woodworkingDetector; } - public Set getFeatures(CrawledDomain domain, Document doc) { + public Set getFeatures(CrawledDomain domain, Document doc, DocumentLanguageData dld) { final Set features = new HashSet<>(); final Elements scriptTags = doc.getElementsByTag("script"); @@ -81,9 +91,14 @@ public class FeatureExtractor { } } - if (!domain.cookies.isEmpty()) { + if (!domain.cookies.isEmpty()) features.add(HtmlFeature.COOKIES); - } + + if (recipeDetector.testP(dld) > 0.5) + features.add(HtmlFeature.CATEGORY_FOOD); + // these should be mutually exclusive + else if (woodworkingDetector.testP(dld) > 0.3 || textileCraftDetector.testP(dld) > 0.3) + features.add(HtmlFeature.CATEGORY_CRAFTS); return features; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java index 5744221d..d6d2d8f4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java @@ -12,6 +12,8 @@ public enum HtmlFeature { CATEGORY_FOOD("category:food"), ADVERTISEMENT("special:ads"), + + CATEGORY_CRAFTS("category:crafts"), ; private final String keyword; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/RecipeDetector.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/RecipeDetector.java index 17f8d992..74122799 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/RecipeDetector.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/RecipeDetector.java @@ -1,6 +1,7 @@ package nu.marginalia.wmsa.edge.converting.processor.logic.topic; import ca.rmen.porterstemmer.PorterStemmer; +import com.google.inject.Inject; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import java.util.HashMap; @@ -14,6 +15,7 @@ public class RecipeDetector { private final Map termValues = new HashMap<>(); + @Inject public RecipeDetector() { PorterStemmer ps = new PorterStemmer(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java index 1146c620..1df3b8ee 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java @@ -1,6 +1,7 @@ package nu.marginalia.wmsa.edge.converting.processor.logic.topic; import ca.rmen.porterstemmer.PorterStemmer; +import com.google.inject.Inject; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import java.util.HashMap; @@ -14,6 +15,7 @@ public class TextileCraftDetector { private final Map termValues = new HashMap<>(); + @Inject public TextileCraftDetector() { PorterStemmer ps = new PorterStemmer(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java index bb4a0cd0..e58320f6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java @@ -1,6 +1,7 
@@ package nu.marginalia.wmsa.edge.converting.processor.logic.topic; import ca.rmen.porterstemmer.PorterStemmer; +import com.google.inject.Inject; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import java.util.HashMap; @@ -14,6 +15,7 @@ public class WoodworkingDetector { private final Map termValues = new HashMap<>(); + @Inject public WoodworkingDetector() { PorterStemmer ps = new PorterStemmer(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java index a9271453..80b37191 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java @@ -15,7 +15,7 @@ import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.function.Consumer; -public class KeywordLexiconJournalFile { +public class KeywordLexiconJournalFile implements AutoCloseable { private final RandomAccessFile journalFileRAF; private final File journalFile; private final Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java index f906265a..1d0915aa 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java @@ -4,22 +4,24 @@ public enum IndexBlock { TitleKeywords(0, 0), Title(1, 1), - Link(2, 1.25), + Link(2, 1.15), - Subjects(3, 0.5), - NamesWords(4, 5), + Subjects(3, 3.0), + NamesWords(4, 3.0), Artifacts(5, 10), Meta(6, 7), - Tfidf_Top(7, 2), - Tfidf_Middle(8, 2.5), - Tfidf_Lower(9, 5.0), + 
Tfidf_Top(7, 0.5), + Tfidf_Middle(8, 1.25), + Tfidf_Lower(9, 1.5), Words_1(10, 3.0), Words_2(11, 3.5), Words_4(12, 4.0), Words_8(13, 4.5), Words_16Plus(14, 7.0), + + Site(15, 1.2), ; public final int id; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java index 3816ec39..56df6ddf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java @@ -29,7 +29,6 @@ public class SearchIndexReader implements AutoCloseable { IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, - IndexBlock.NamesWords, IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, @@ -62,15 +61,14 @@ public class SearchIndexReader implements AutoCloseable { queryBuilders = new EnumMap<>(IndexBlock.class); underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class); - queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1), words1)); - queryBuilders.put(IndexBlock.Words_2, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words2), words1)); - queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words4), words1)); - queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words8), words1)); - queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1)); + queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words1), words1)); + queryBuilders.put(IndexBlock.Words_2, 
new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words2), words1)); + queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words4), words1)); + queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words8), words1)); + queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1)); - underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1)); - underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1)); - underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1)); + underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, linkIndex, topIndex, namesIndex, topicIndex, metaIndex), words1)); + underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(linkIndex, namesIndex, topIndex, midIndex, lowIndex, topicIndex, metaIndex), words1)); } @SafeVarargs diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java index 10ee3542..7dfe0f6a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java @@ -19,7 +19,8 @@ public class EdgePageWordSet { public EdgePageWords get(IndexBlock block) { var words = wordSets.get(block); if (words == 
null) { - return new EdgePageWords(block); + words = new EdgePageWords(block); + wordSets.put(block, words); } return words; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java index 4a2086ca..ce1aea9e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java @@ -1,7 +1,7 @@ package nu.marginalia.wmsa.edge.search.query; import com.google.inject.Inject; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -13,11 +13,11 @@ import java.util.stream.Collectors; public class EnglishDictionary { private final Set englishWords = new HashSet<>(); - private final NGramDict dict; + private final TermFrequencyDict dict; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public EnglishDictionary(NGramDict dict) { + public EnglishDictionary(TermFrequencyDict dict) { this.dict = dict; try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-words"), "Could not load word frequency table"); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java index 3badd593..a602f620 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java @@ -4,7 +4,8 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; 
+import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; @@ -22,20 +23,22 @@ import java.util.*; public class QueryFactory { private final LanguageModels lm; - private final NGramDict dict; + private final TermFrequencyDict dict; private final EnglishDictionary englishDictionary; + private final NGramBloomFilter nGramBloomFilter; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public QueryFactory(LanguageModels lm, NGramDict dict, EnglishDictionary englishDictionary) { + public QueryFactory(LanguageModels lm, TermFrequencyDict dict, EnglishDictionary englishDictionary, NGramBloomFilter nGramBloomFilter) { this.lm = lm; this.dict = dict; this.englishDictionary = englishDictionary; + this.nGramBloomFilter = nGramBloomFilter; } public QueryParser getParser() { - return new QueryParser(englishDictionary, new QueryVariants(lm ,dict, englishDictionary)); + return new QueryParser(englishDictionary, new QueryVariants(lm ,dict, nGramBloomFilter, englishDictionary)); } public EdgeSearchQuery createQuery(EdgeUserSearchParameters params) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java index 2b509397..1112b7a9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java @@ -10,7 +10,8 @@ import nu.marginalia.util.language.processing.KeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.model.DocumentSentence; import 
nu.marginalia.util.language.processing.model.WordSpan; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; +import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import opennlp.tools.stemmer.PorterStemmer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,14 +25,18 @@ public class QueryVariants { private final Logger logger = LoggerFactory.getLogger(getClass()); private final KeywordExtractor keywordExtractor; private final SentenceExtractor sentenceExtractor; - private final NGramDict dict; + private final TermFrequencyDict dict; private final PorterStemmer ps = new PorterStemmer(); - private final static int MAX_NGRAM_LENGTH = 4; + private final NGramBloomFilter nGramBloomFilter; private final EnglishDictionary englishDictionary; @Inject - public QueryVariants(LanguageModels lm, NGramDict dict, EnglishDictionary englishDictionary) { + public QueryVariants(LanguageModels lm, + TermFrequencyDict dict, + NGramBloomFilter nGramBloomFilter, + EnglishDictionary englishDictionary) { + this.nGramBloomFilter = nGramBloomFilter; this.englishDictionary = englishDictionary; this.keywordExtractor = new KeywordExtractor(); this.sentenceExtractor = new SentenceExtractor(lm); @@ -154,11 +159,11 @@ public class QueryVariants { double q = 0; for (var word : lst) { String[] parts = underscore.split(word); - StringJoiner combined = new StringJoiner("_"); + double qp = 0; for (String part : parts) { - combined.add(ps.stem(part)); + qp += 1./(1+ dict.getTermFreq(part)); } - q += Math.log(1 + dict.getTermFreqStemmed(combined.toString())); + q += 1.0 / qp; } ret.add(new QueryVariant(lst, q)); } @@ -215,8 +220,8 @@ public class QueryVariants { while (wordMatcher.find(ws) && stemmedMatcher.find(ss)) { ws = wordMatcher.start()+1; ss = stemmedMatcher.start()+1; - if (dict.getTermFreqStemmed(splitAtNumBoundaryAndStem(span.word, stemmedMatcher.start(), "_")) > 0 - || 
dict.getTermFreqStemmed(splitAtNumBoundaryAndStem(span.word, stemmedMatcher.start(), "-")) > 0) + if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_")) + || nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-"))) { String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_"); asTokens2.add(combined); @@ -242,7 +247,7 @@ public class QueryVariants { for (var span : ls) { var matcher = dashBoundary.matcher(span.word); - if (matcher.find() && dict.getTermFreqStemmed(ps.stem(dashBoundary.matcher(span.word).replaceAll(""))) > 0) { + if (matcher.find() && nGramBloomFilter.isKnownNGram(ps.stem(dashBoundary.matcher(span.word).replaceAll("")))) { dash = true; String combined = dashBoundary.matcher(span.word).replaceAll(""); asTokens2.add(combined); @@ -262,10 +267,6 @@ public class QueryVariants { return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1); } - private String splitAtNumBoundaryAndStem(String in, int splitPoint, String joiner) { - return ps.stem(in.substring(0, splitPoint+1)) + joiner + ps.stem(in.substring(splitPoint+1)); - } - private List> getWordSpans(TreeMap> byStart, DocumentSentence sentence, List> livingSpans) { List> goodSpans = new ArrayList<>(); for (int i = 0; i < sentence.length(); i++) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java index e82153ed..a1c3bc58 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java @@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.search.results; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.util.language.WordPatterns; -import 
nu.marginalia.wmsa.edge.assistant.dict.NGramDict; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore; @@ -12,7 +12,7 @@ import java.util.regex.Pattern; @Singleton public class SearchResultValuator { - private final NGramDict dict; + private final TermFrequencyDict dict; private static final Pattern separator = Pattern.compile("_"); @@ -20,7 +20,7 @@ public class SearchResultValuator { private static final int AVG_LENGTH = 1400; @Inject - public SearchResultValuator(NGramDict dict) { + public SearchResultValuator(TermFrequencyDict dict) { this.dict = dict; } diff --git a/marginalia_nu/src/main/resources/templates/edge/index.hdb b/marginalia_nu/src/main/resources/templates/edge/index.hdb index c725054d..14e518ba 100644 --- a/marginalia_nu/src/main/resources/templates/edge/index.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/index.hdb @@ -2,7 +2,7 @@ - Marginalia Search} + Marginalia Search diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java b/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java index b5ce6bdb..f8554868 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java @@ -1,7 +1,7 @@ package nu.marginalia.util; -import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.util.language.conf.LanguageModels; +import nu.marginalia.wmsa.configuration.WmsaHome; import java.nio.file.Files; import java.nio.file.Path; @@ -26,7 +26,7 @@ public class TestLanguageModels { var languageModelsHome = getLanguageModelsPath(); return new LanguageModels( - languageModelsHome.resolve("ngrams-generous-emstr.bin"), + languageModelsHome.resolve("ngrams.bin"), languageModelsHome.resolve("tfreq-generous-emstr.bin"), languageModelsHome.resolve("opennlp-sentence.bin"), 
languageModelsHome.resolve("English.RDR"), diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/suggest/SuggestionsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/suggest/SuggestionsTest.java index c24d5e1f..2acf9165 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/suggest/SuggestionsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/suggest/SuggestionsTest.java @@ -1,9 +1,9 @@ package nu.marginalia.wmsa.edge.assistant.suggest; import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; -import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker; import nu.marginalia.util.language.conf.LanguageModels; +import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -17,7 +17,7 @@ class SuggestionsTest { public static void setUp() { LanguageModels lm = TestLanguageModels.getLanguageModels(); suggestions = new Suggestions(Path.of("/home/vlofgren/Work/sql-titles-clean"), - new SpellChecker(), new NGramDict(lm)); + new SpellChecker(), new TermFrequencyDict(lm)); } @Test diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java index ea742a93..792a4221 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java @@ -10,7 +10,7 @@ import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.util.language.processing.model.tag.WordSeparator; -import 
nu.marginalia.wmsa.edge.assistant.dict.NGramDict; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader; import nu.marginalia.wmsa.edge.model.EdgeDomain; import org.jsoup.Jsoup; @@ -45,7 +45,7 @@ class SentenceExtractorTest { System.out.println("Running"); - var dict = new NGramDict(lm); + var dict = new TermFrequencyDict(lm); SentenceExtractor se = new SentenceExtractor(lm); KeywordExtractor keywordExtractor = new KeywordExtractor(); @@ -85,7 +85,7 @@ class SentenceExtractorTest { System.out.println("Running"); - var dict = new NGramDict(lm); + var dict = new TermFrequencyDict(lm); DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); @@ -110,7 +110,7 @@ class SentenceExtractorTest { System.out.println("Running"); - var dict = new NGramDict(lm); + var dict = new TermFrequencyDict(lm); DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); @@ -154,7 +154,7 @@ class SentenceExtractorTest { public void testSE() { var result = newSe.extractSentences(Jsoup.parse(new URL("https://memex.marginalia.nu/log/26-personalized-pagerank.gmi"), 10000)); - var dict = new NGramDict(lm); + var dict = new TermFrequencyDict(lm); System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result)); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java index 0f36b0a9..8f250456 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java @@ -1,10 +1,10 @@ package nu.marginalia.wmsa.edge.integration.arxiv; import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import 
nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.integration.arxiv.model.ArxivMetadata; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -26,7 +26,7 @@ class ArxivParserTest { @Test void extractKeywords() throws IOException { - var dict = new NGramDict(lm); + var dict = new TermFrequencyDict(lm); DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java index 0feda3dd..b0c98dc9 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java @@ -1,11 +1,11 @@ package nu.marginalia.wmsa.edge.integration.stackoverflow; +import nu.marginalia.util.ParallelPipe; import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; -import nu.marginalia.util.ParallelPipe; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost; import nu.marginalia.wmsa.edge.model.EdgeDomain; @@ -20,7 +20,7 @@ public class StackOverflowPostsTest { @Test @Disabled("this is stupidly slow") public void test() throws 
ParserConfigurationException, SAXException, InterruptedException { - var documentKeywordExtractor = new DocumentKeywordExtractor(new NGramDict(lm)); + var documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(lm)); ThreadLocal processor = ThreadLocal.withInitial(() -> { return new StackOverflowPostProcessor(new SentenceExtractor(lm), documentKeywordExtractor); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java index e4787094..47fa3b5d 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java @@ -1,13 +1,13 @@ package nu.marginalia.wmsa.edge.integration.wikipedia; import lombok.SneakyThrows; +import nu.marginalia.util.ParallelPipe; import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.util.language.DocumentDebugger; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; -import nu.marginalia.util.ParallelPipe; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle; import nu.marginalia.wmsa.edge.model.EdgeDomain; @@ -21,7 +21,7 @@ public class WikipediaTest { @Test @SneakyThrows public void test() { - var documentKeywordExtractor = new DocumentKeywordExtractor(new NGramDict(lm)); + var documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(lm)); ThreadLocal processor = ThreadLocal.withInitial(() -> { return new WikipediaProcessor(new SentenceExtractor(lm), 
documentKeywordExtractor); }); @@ -48,7 +48,7 @@ public class WikipediaTest { @Test @SneakyThrows public void test2() { - var documentKeywordExtractor = new DocumentKeywordExtractor(new NGramDict(lm)); + var documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(lm)); var debugger = new DocumentDebugger(lm); ThreadLocal processor = ThreadLocal.withInitial(() -> { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java index ba57d530..4e1bc2b6 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java @@ -1,31 +1,35 @@ package nu.marginalia.wmsa.edge.search.query; import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.util.language.conf.LanguageModels; +import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import java.io.IOException; import java.util.List; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; class BodyQueryParserTest { private QueryParser parser; - private static NGramDict dict; + private static TermFrequencyDict dict; private static EnglishDictionary englishDictionary; + private static NGramBloomFilter nGramBloomFilter; private static final LanguageModels lm = TestLanguageModels.getLanguageModels(); @BeforeAll - public static void init() { - dict = new NGramDict(lm); + public static void init() throws IOException { + dict = new TermFrequencyDict(lm); + nGramBloomFilter = new NGramBloomFilter(lm); englishDictionary = new 
EnglishDictionary(dict); } @BeforeEach public void setUp() { - parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, englishDictionary)); + parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary)); } @Test diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionaryTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionaryTest.java index 79c2bd1b..f6a5b66f 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionaryTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionaryTest.java @@ -1,8 +1,8 @@ package nu.marginalia.wmsa.edge.search.query; import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.util.language.conf.LanguageModels; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import org.junit.jupiter.api.Test; class EnglishDictionaryTest { @@ -11,7 +11,7 @@ class EnglishDictionaryTest { void getWordVariants() { LanguageModels lm = TestLanguageModels.getLanguageModels(); - var dict = new NGramDict(lm); + var dict = new TermFrequencyDict(lm); new EnglishDictionary(dict).getWordVariants("dos").forEach(System.out::println); } } \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java index 1c971624..5d2a2f83 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java @@ -1,25 +1,29 @@ package nu.marginalia.wmsa.edge.search.query; import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import 
nu.marginalia.util.language.conf.LanguageModels; +import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import java.io.IOException; import java.util.stream.Collectors; class QueryParserTest { private QueryParser parser; - private static NGramDict dict; + private static TermFrequencyDict dict; private static EnglishDictionary englishDictionary; + private static NGramBloomFilter nGramBloomFilter; private static final LanguageModels lm = TestLanguageModels.getLanguageModels(); @BeforeEach - public void setUp() { - dict = new NGramDict(lm); + public void setUp() throws IOException { + dict = new TermFrequencyDict(lm); + nGramBloomFilter = new NGramBloomFilter(lm); englishDictionary = new EnglishDictionary(dict); - parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, englishDictionary)); + parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary)); } @Test diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java index 35bba6c5..597341bc 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java @@ -3,23 +3,27 @@ package nu.marginalia.wmsa.edge.search.query; import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.SentenceExtractor; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; +import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; 
+import java.io.IOException; + class QueryVariantsTest { QueryVariants variants; QueryParser parser; SentenceExtractor se; @BeforeEach - public void setUp() { + public void setUp() throws IOException { LanguageModels lm = TestLanguageModels.getLanguageModels(); se = new SentenceExtractor(lm); - var dict = new NGramDict(lm); - variants = new QueryVariants(lm, dict, new EnglishDictionary(dict)); + var dict = new TermFrequencyDict(lm); + var ngrams = new NGramBloomFilter(lm); + variants = new QueryVariants(lm, dict, ngrams, new EnglishDictionary(dict)); parser = new QueryParser(new EnglishDictionary(dict), variants); } From 5f993c72ddb4c2be9007bb27ec2106f31cdc3469 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Fri, 2 Sep 2022 09:34:20 +0200 Subject: [PATCH 08/19] Tweaks for search result relevance --- .../util/language/WordPatterns.java | 4 +- .../language/processing/AsciiFlattener.java | 4 +- .../processing/DocumentKeywordExtractor.java | 27 +-- .../language/processing/KeywordCounter.java | 30 +++- .../language/processing/KeywordExtractor.java | 77 +-------- .../util/language/processing/NameCounter.java | 3 + .../wmsa/edge/converting/ConverterMain.java | 9 +- .../converting/CrawledInstructionWriter.java | 62 ++++++- .../wmsa/edge/converting/loader/Loader.java | 2 +- .../converting/loader/SqlLoadDomainLinks.java | 9 +- .../converting/model/ProcessedDocument.java | 1 + .../processor/DocumentProcessor.java | 45 ++++- .../converting/processor/DomainProcessor.java | 102 ++++++++---- .../logic/CommonKeywordExtractor.java | 71 ++++++++ .../processor/logic/HtmlFeature.java | 2 + .../edge/crawling/CrawledDomainReader.java | 13 +- .../crawling/model/CrawlerDocumentStatus.java | 3 +- .../edge/data/dao/EdgeDataStoreDaoImpl.java | 8 +- .../wmsa/edge/index/EdgeIndexService.java | 114 ++++--------- .../wmsa/edge/index/model/IndexBlock.java | 12 +- .../edge/index/reader/SearchIndexReader.java | 9 +- .../index/reader/query/IndexQueryBuilder.java | 2 +- 
.../wmsa/edge/model/EdgeCrawlPlan.java | 15 ++ .../model/search/EdgeSearchResultItem.java | 6 +- .../search/EdgeSearchResultKeywordScore.java | 10 +- .../model/search/EdgeSearchResultSet.java | 2 +- .../edge/model/search/EdgeSearchResults.java | 16 +- .../edge/model/search/EdgeUrlDetails.java | 3 +- .../wmsa/edge/search/EdgeSearchOperator.java | 20 ++- .../wmsa/edge/search/EdgeSearchProfile.java | 14 +- .../edge/search/query/EnglishDictionary.java | 11 +- .../wmsa/edge/search/query/QueryFactory.java | 2 +- .../search/results/SearchResultDecorator.java | 18 +- .../search/results/SearchResultValuator.java | 97 +++++++---- .../edge/tools/ConverterLogicTestTool.java | 59 +++++-- .../main/resources/dictionary/en-stopwords | 9 - .../loader/SqlLoadDomainLinksTest.java | 2 +- .../edge/crawling/SentenceExtractorTest.java | 39 +---- .../index/service/EdgeIndexClientTest.java | 156 ------------------ .../search/query/BodyQueryParserTest.java | 2 +- .../search/query/EnglishDictionaryTest.java | 17 -- .../edge/search/query/QueryParserTest.java | 2 +- .../edge/search/query/QueryVariantsTest.java | 4 +- 43 files changed, 539 insertions(+), 574 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionaryTest.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java index b7a588db..e09c7709 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java @@ -63,7 +63,7 @@ public class WordPatterns { if (word.isBlank()) { return false; } - if (hasMoreThanTwo(word, '-', 2)) { + if 
(hasMoreThanTwo(word, '-', 4)) { return false; } if (hasMoreThanTwo(word, '+', 2)) { @@ -80,7 +80,7 @@ public class WordPatterns { if (Character.isDigit(word.charAt(i))) { numDigits++; } - if (numDigits > 6) + if (numDigits > 16) return false; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/AsciiFlattener.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/AsciiFlattener.java index 0b144f2d..ca53c7d8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/AsciiFlattener.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/AsciiFlattener.java @@ -5,8 +5,8 @@ import java.util.regex.Pattern; public class AsciiFlattener { - private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:]+"); - private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:]+$"); + private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:\\-]+"); + private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$"); private static final Predicate plainAscii = plainAsciiPattern.asMatchPredicate(); public static String flattenUnicode(String s) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java index 0eed6a2e..2626e2e8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java @@ -1,6 +1,5 @@ package nu.marginalia.util.language.processing; -import com.google.common.collect.Sets; import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.util.language.processing.model.WordRep; @@ -12,7 +11,6 @@ import 
nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import javax.inject.Inject; import java.util.*; import java.util.stream.Collectors; -import java.util.stream.Stream; public class DocumentKeywordExtractor { @@ -42,7 +40,7 @@ public class DocumentKeywordExtractor { List titleWords = extractTitleWords(documentLanguageData); KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData); - List wordsNamesAll = nameCounter.count(documentLanguageData, 1); + List wordsNamesAll = nameCounter.count(documentLanguageData, 2); List subjects = subjectCounter.count(documentLanguageData); List midKeywords = new ArrayList<>(wordsTfIdf.mid()); @@ -190,30 +188,7 @@ public class DocumentKeywordExtractor { .collect(Collectors.toList()); } - private Collection joinWordLists(List... words) { - int size = 0; - for (var lst : words) { - size += lst.size(); - } - if (size == 0) - return Collections.emptyList(); - - final LinkedHashSet ret = new LinkedHashSet<>(size); - for (var lst : words) { - ret.addAll(lst); - } - return ret; - } - - public EdgePageWords createWords(IndexBlock block, Collection words) { return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet())); } - - private Set overlappingStems(Collection wordsA, Collection wordsB) { - Set stemmedA = wordsA.stream().map(WordRep::getStemmed).collect(Collectors.toSet()); - Set stemmedB = wordsB.stream().map(WordRep::getStemmed).collect(Collectors.toSet()); - Set stemmedIntersect = Sets.intersection(stemmedA, stemmedB); - return Stream.concat(wordsA.stream(), wordsB.stream()).filter(w -> stemmedIntersect.contains(w.getStemmed())).collect(Collectors.toSet()); - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java index 4217b743..5a8af220 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java @@ -1,5 +1,6 @@ package nu.marginalia.util.language.processing; +import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; @@ -22,17 +23,20 @@ public class KeywordCounter { } public WordHistogram countHisto(DocumentLanguageData dld) { - HashMap counts = new HashMap<>(1000); + HashMap counts = new HashMap<>(1000); HashMap> instances = new HashMap<>(1000); for (var sent : dld.sentences) { var keywords = keywordExtractor.getKeywordsFromSentence(sent); for (var span : keywords) { + if (span.size() == 1 && + WordPatterns.isStopWord(sent.words[span.start])) + continue; String stemmed = sent.constructStemmedWordFromSpan(span); - counts.merge(stemmed, 1., Double::sum); + counts.merge(stemmed, 1, Integer::sum); instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span)); } } @@ -43,15 +47,23 @@ public class KeywordCounter { Set h10 = new HashSet<>(); Set h15 = new HashSet<>(); + int doubleWordCount = 0; + for (var entry : counts.entrySet()) { double value = getTermValue(entry, maxC); + + double avgCnt = entry.getValue(); + String wordStemmed = entry.getKey(); + Set histogram; - if (value < -3) histogram = h15; - else if (value < -2) histogram = h10; - else if (value < -1) histogram = h5; + if (value < -3 && avgCnt>1) histogram = h15; + else if (value < -1.75 && avgCnt>1) histogram = h10; + else if (value < -1 && + (!wordStemmed.contains("_") || doubleWordCount++ < 50)) + histogram = h5; else continue; - histogram.addAll(instances.get(entry.getKey())); + histogram.addAll(instances.get(wordStemmed)); } return new WordHistogram(h5, h10, h15); @@ -59,7 +71,7 @@ public class 
KeywordCounter { private static final Pattern separator = Pattern.compile("_"); - public double getTermValue(Map.Entry e, double maxValue) { + public double getTermValue(Map.Entry e, double maxValue) { String[] parts = separator.split(e.getKey()); double totalValue = 0.; for (String part : parts) { @@ -71,9 +83,9 @@ public class KeywordCounter { double value(String key, double value, double maxValue) { double freq = dict.getTermFreqStemmed(key); if (freq < 1) { - freq = 10; + freq = 1; } - return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/docCount); + return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount); } public record WordHistogram(Set lower, Set mid, Set top) { } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java index ea68c63e..08c586e2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java @@ -10,84 +10,9 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Set; -import java.util.function.Function; -import java.util.stream.IntStream; -import java.util.stream.Stream; public class KeywordExtractor { - public boolean isLegacy() { - return legacy; - } - - public void setLegacy(boolean legacy) { - this.legacy = legacy; - } - - private boolean legacy; - - public WordSpan[] getNameLikes(DocumentSentence sentence) { - var direct = IntStream.range(0, sentence.length()) - .filter(i -> sentence.posTags[i].startsWith("N")) - .mapToObj(i -> new WordSpan(i, i+1)) - ; - var two = IntStream.range(1, sentence.length()) - .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE) - .filter(i -> isName(i, sentence, Collections.emptySet())) - .filter(i -> isName(i -1, sentence, Collections.emptySet())) - .mapToObj(i -> new WordSpan(i-1, 
i+1)) - ; - - var a_in_b = IntStream.range(2, sentence.length()) - .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE) - .filter(i -> isProperNoun(i, sentence)) - .filter(i -> isJoiner(sentence, i-1)) - .filter(i -> isProperNoun(i-2, sentence)) - .mapToObj(i -> new WordSpan(i-2, i+1)) - ; - - var a_in_det_b = IntStream.range(3, sentence.length()) - .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE - && sentence.separators[i-2] == WordSeparator.SPACE) - .filter(i -> isProperNoun(i, sentence)) - .filter(i -> isJoiner(sentence, i-1)) - .filter(i -> sentence.posTags[i-2].equals("DT")) - .filter(i -> isProperNoun(i-3, sentence)) - .mapToObj(i -> new WordSpan(i-3, i+1)) - ; - var a_in_in_b = IntStream.range(3, sentence.length()) - .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE - && sentence.separators[i-2] == WordSeparator.SPACE) - .filter(i -> isProperNoun(i, sentence)) - .filter(i -> isJoiner(sentence, i-1) || isProperNoun(i-1, sentence)) - .filter(i -> isJoiner(sentence, i-2) || isProperNoun(i-2, sentence)) - .filter(i -> isProperNoun(i-3, sentence)) - .mapToObj(i -> new WordSpan(i-3, i+1)) - ; - var three = IntStream.range(2, sentence.length()) - .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE - && sentence.separators[i-2] == WordSeparator.SPACE) - .filter(i -> isName(i, sentence, Collections.emptySet())) - .filter(i -> isName(i-1, sentence, Collections.emptySet())) - .filter(i -> isName(i-2, sentence, Collections.emptySet())) - .mapToObj(i -> new WordSpan(i-2, i+1)) - ; - var four = IntStream.range(3, sentence.length()) - .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE - && sentence.separators[i-2] == WordSeparator.SPACE - && sentence.separators[i-3] == WordSeparator.SPACE) - .filter(i -> isName(i, sentence, Collections.emptySet())) - .filter(i -> isName(i - 1, sentence, Collections.emptySet())) - .filter(i -> isName(i - 2, sentence, Collections.emptySet())) - .filter(i -> isName(i - 3, sentence, 
Collections.emptySet())) - .mapToObj(i -> new WordSpan(i-3, i+1)) - ; - - return Stream.of(direct, two, a_in_b, a_in_in_b, a_in_det_b, three, four).flatMap(Function.identity()) - .toArray(WordSpan[]::new); - } - - public WordSpan[] getNames(DocumentSentence sentence) { List spans = new ArrayList<>(sentence.length()); @@ -214,7 +139,7 @@ public class KeywordExtractor { } String word = sentence.constructWordFromSpan(w); - if (word.isBlank() || WordPatterns.isStopWord(word)) return false; + if (word.isBlank() || !WordPatterns.filter(word)) return false; if (sentence.posTags[w.start].equals("CC")) return false; if (sentence.posTags[w.end-1].equals("IN")) return false; if (sentence.posTags[w.end-1].equals("DT")) return false; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java index c52871bc..142f1477 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java @@ -22,6 +22,9 @@ public class NameCounter { DocumentSentence sent = dld.sentences[i]; var keywords = keywordExtractor.getNames(sent); for (var span : keywords) { + if (span.size() <= 1) + continue; + var stemmed = sent.constructStemmedWordFromSpan(span); counts.merge(stemmed, 1., Double::sum); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java index 93814b46..11251a67 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java @@ -52,7 +52,7 @@ public class ConverterMain { logger.info("Starting pipe"); try (WorkLog processLog = plan.createProcessWorkLog()) { - var pipe = new ParallelPipe("Crawler", 48, 4, 2) { + 
var pipe = new ParallelPipe("Crawler", 16, 4, 2) { @Override protected ProcessingInstructions onProcess(CrawledDomain domainData) { @@ -73,12 +73,7 @@ public class ConverterMain { }; - plan.forEachCrawledDomain(domain -> { - if (!processLog.isJobFinished(domain.id)) { - logger.info("{} - {}", domain.domain, domain.id); - pipe.accept(domain); - } - }); + plan.forEachCrawledDomain(id -> !processLog.isJobFinished(id), pipe::accept); pipe.join(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/CrawledInstructionWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/CrawledInstructionWriter.java index 3140808e..00f03778 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/CrawledInstructionWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/CrawledInstructionWriter.java @@ -3,6 +3,14 @@ package nu.marginalia.wmsa.edge.converting; import com.github.luben.zstd.ZstdOutputStream; import com.google.gson.Gson; import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; +import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,7 +44,9 @@ public class CrawledInstructionWriter { } try (var outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))))) { - logger.info("Writing {} - {}", id, instructionList.size()); + + 
SummarizingInterpreter summary = new SummarizingInterpreter(instructionList); + logger.info("Writing {} - {} - {}", id, instructionList.size(), summary); for (var instr : instructionList) { outputStream.append(instr.tag().name()); @@ -59,4 +69,54 @@ public class CrawledInstructionWriter { } return destDir.resolve(id + ".pzstd"); } + + private static class SummarizingInterpreter implements Interpreter { + + private SummarizingInterpreter(List instructions) { + for (var i : instructions) { + i.apply(this); + } + } + + private String domainName; + private int ok = 0; + private int error = 0; + + public String toString() { + return String.format("%s - %d %d", domainName, ok, error); + } + + @Override + public void loadUrl(EdgeUrl[] url) {} + + @Override + public void loadDomain(EdgeDomain[] domain) {} + + @Override + public void loadRssFeed(EdgeUrl[] rssFeed) {} + + @Override + public void loadDomainLink(DomainLink[] links) {} + + @Override + public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) { + this.domainName = domain.toString(); + } + + @Override + public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) { + ok++; + } + + @Override + public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) { + error++; + } + + @Override + public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {} + + @Override + public void loadDomainRedirect(DomainLink link) {} + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java index 49a39457..00ed50d3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java @@ -72,7 +72,7 @@ public class Loader implements Interpreter { @Override public void loadDomainLink(DomainLink[] 
links) { logger.debug("loadDomainLink({})", links, null); - sqlLoadDomainLinks.load(links); + sqlLoadDomainLinks.load(data, links); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java index 6750bd33..268d8b28 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java @@ -40,13 +40,20 @@ public class SqlLoadDomainLinks { } } - public void load(DomainLink[] links) { + public void load(LoaderData data, DomainLink[] links) { try (var connection = dataSource.getConnection(); + var nukeExistingLinksForDomain = + connection.prepareStatement(""" + DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=? + """); var stmt = connection.prepareCall("CALL INSERT_LINK(?,?)")) { + nukeExistingLinksForDomain.setInt(1, data.getDomainId(links[0].from())); + nukeExistingLinksForDomain.executeUpdate(); + for (DomainLink link : links) { stmt.setString(1, link.from().toString()); stmt.setString(2, link.to().toString()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java index e73b6a8f..1b8eb155 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java @@ -15,6 +15,7 @@ public class ProcessedDocument { public EdgePageWordSet words; public EdgeUrlState state; + public String stateReason; public OptionalDouble quality() { if (details != null) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index 70a58dc2..60d1071d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -70,11 +70,22 @@ public class DocumentProcessor { this.summaryExtractor = summaryExtractor; } + public ProcessedDocument makeDisqualifiedStub(CrawledDocument crawledDocument) { + ProcessedDocument ret = new ProcessedDocument(); + + try { + ret.state = EdgeUrlState.DISQUALIFIED; + ret.url = getDocumentUrl(crawledDocument); + } + catch (Exception ex) {} + + return ret; + } public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) { ProcessedDocument ret = new ProcessedDocument(); try { - ret.url = new EdgeUrl(crawledDocument.url); + ret.url = getDocumentUrl(crawledDocument); ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus); if (ret.state == EdgeUrlState.OK) { @@ -99,17 +110,31 @@ public class DocumentProcessor { } catch (DisqualifiedException ex) { ret.state = EdgeUrlState.DISQUALIFIED; + ret.stateReason = ex.reason.toString(); logger.debug("Disqualified {}: {}", ret.url, ex.reason); } catch (Exception ex) { ret.state = EdgeUrlState.DISQUALIFIED; - logger.info("Failed to convert " + ret.url, ex); + logger.info("Failed to convert " + crawledDocument.url, ex); ex.printStackTrace(); } return ret; } + private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument) + throws URISyntaxException + { + if (crawledDocument.canonicalUrl != null) { + try { + return new EdgeUrl(crawledDocument.canonicalUrl); + } + catch (URISyntaxException ex) { /* fallthrough */ } + } + + return new EdgeUrl(crawledDocument.url); + } + public static boolean isAcceptedContentType(CrawledDocument crawledDocument) { if (crawledDocument.contentType == null) { return false; @@ -155,20 
+180,26 @@ public class DocumentProcessor { var ret = new ProcessedDocumentDetails(); - ret.description = getDescription(doc); + ret.length = getLength(doc); ret.standard = getHtmlStandard(doc); ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url); - ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld); + ret.quality = documentValuator.getQuality(ret.standard, doc, dld); ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong(); + final boolean doSimpleProcessing = ret.quality < minDocumentQuality; + EdgePageWordSet words; - if (ret.quality < minDocumentQuality || dld.totalNumWords() < minDocumentLength) { + if (doSimpleProcessing) { + ret.features = Set.of(HtmlFeature.UNKNOWN); words = keywordExtractor.extractKeywordsMinimal(dld); + ret.description = ""; } else { + ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld); words = keywordExtractor.extractKeywords(dld); + ret.description = getDescription(doc); } var url = new EdgeUrl(crawledDocument.url); @@ -276,6 +307,10 @@ public class DocumentProcessor { } private void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException { + if (dld.totalNumWords() < minDocumentLength) { + throw new DisqualifiedException(DisqualificationReason.LENGTH); + } + double languageAgreement = languageFilter.dictionaryAgreement(dld); if (languageAgreement < 0.1) { throw new DisqualifiedException(DisqualificationReason.LANGUAGE); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java index 549db2c9..c12f992a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java @@ -1,23 +1,27 @@ package nu.marginalia.wmsa.edge.converting.processor; +import 
com.google.common.base.Strings; import com.google.inject.Inject; import com.google.inject.name.Named; import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; +import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor; +import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; -import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; import java.util.*; -import java.util.stream.Collectors; public class DomainProcessor { + private static final CommonKeywordExtractor commonKeywordExtractor = new CommonKeywordExtractor(); + private final DocumentProcessor documentProcessor; private final Double minAvgDocumentQuality; + @Inject public DomainProcessor(DocumentProcessor documentProcessor, @Named("min-avg-document-quality") Double minAvgDocumentQuality @@ -39,61 +43,71 @@ public class DomainProcessor { if (crawledDomain.doc != null) { ret.documents = new ArrayList<>(crawledDomain.doc.size()); + fixBadCanonicalTags(crawledDomain.doc); + + DocumentDisqualifier disqualifier = new DocumentDisqualifier(); for (var doc : crawledDomain.doc) { - var processedDoc = documentProcessor.process(doc, crawledDomain); - if (processedDoc.url != null) { - ret.documents.add(processedDoc); + if (disqualifier.isQualified()) { + var processedDoc = documentProcessor.process(doc, crawledDomain); + + if (processedDoc.url != null) { + ret.documents.add(processedDoc); + processedDoc.quality().ifPresent(disqualifier::offer); + } + else if ("LANGUAGE".equals(processedDoc.stateReason)) { + disqualifier.offer(-100); + } + } + else { // Short-circuit processing if quality is too low + var stub = 
documentProcessor.makeDisqualifiedStub(doc); + if (stub.url != null) { + ret.documents.add(stub); + } } } - addCommonSiteWords(ret); + Set commonSiteWords = new HashSet<>(10); + + commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Tfidf_Top, IndexBlock.Subjects)); + commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Title)); + + if (!commonSiteWords.isEmpty()) { + for (var doc : ret.documents) { + if (doc.words != null) { + doc.words.get(IndexBlock.Site).addAll(commonSiteWords); + } + } + } } else { ret.documents = Collections.emptyList(); } - double averageQuality = getAverageQuality(ret.documents); - if (averageQuality < minAvgDocumentQuality) { - ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED); - } - ret.state = getState(crawledDomain.crawlerStatus); return ret; } - private void addCommonSiteWords(ProcessedDomain ret) { + private void fixBadCanonicalTags(List docs) { + Map> seenCanonicals = new HashMap<>(); - if (ret.documents.size() < 25) - return; + // Sometimes sites set a blanket canonical link to their root page + // this removes such links from consideration - Map topKeywordCount = new HashMap<>(ret.documents.size()*10); - - for (var doc : ret.documents) { - if (doc.words == null) - continue; - - for (var word : doc.words.get(IndexBlock.Tfidf_Top).words) { - topKeywordCount.merge(word, -1, Integer::sum); + for (var document : docs) { + if (!Strings.isNullOrEmpty(document.canonicalUrl) && !Objects.equals(document.canonicalUrl, document.url)) { + seenCanonicals.computeIfAbsent(document.canonicalUrl, url -> new HashSet<>()).add(document.documentBodyHash); } } - if (topKeywordCount.values().stream().mapToInt(i -> i).sum() > -100) - return; - - Set topWords = topKeywordCount.entrySet().stream() - .filter(e -> e.getValue() < -10) - .sorted(Map.Entry.comparingByValue()).limit(5) - .map(Map.Entry::getKey) - .collect(Collectors.toSet()); - - if (!topWords.isEmpty()) { - for (var doc : 
ret.documents) { - if (doc.words != null) { - doc.words.get(IndexBlock.Site).addAll(topWords); - } + for (var document : docs) { + if (!Strings.isNullOrEmpty(document.canonicalUrl) + && !Objects.equals(document.canonicalUrl, document.url) + && seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) { + document.canonicalUrl = document.url; } } + } private double getAverageQuality(List documents) { @@ -120,4 +134,20 @@ public class DomainProcessor { default -> EdgeDomainIndexingState.ERROR; }; } + + class DocumentDisqualifier { + int count; + int goodCount; + + void offer(double quality) { + count++; + if (quality > minAvgDocumentQuality) { + goodCount++; + } + } + + boolean isQualified() { + return count < 25 || goodCount*10 >= count; + } + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java new file mode 100644 index 00000000..7628e09a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java @@ -0,0 +1,71 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic; + +import ca.rmen.porterstemmer.PorterStemmer; +import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; + +import java.util.*; + +public class CommonKeywordExtractor { + private final PorterStemmer ps = new PorterStemmer(); + + private static final int MIN_REQUIRED_DOCUMENTS = 25; + + private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 100; + private static final double QUALIFYING_PROPORTION_FOR_KEYWORD = .25; + + private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5; + + public List getCommonSiteWords(ProcessedDomain ret, IndexBlock... 
sourceBlocks) { + + if (ret.documents.size() < MIN_REQUIRED_DOCUMENTS) + return Collections.emptyList(); + + final Map wordToStemmedMemoized = new HashMap<>(ret.documents.size()*10); + + final Map topStemmedKeywordCount = new HashMap<>(ret.documents.size()*10); + final Map> stemmedToNonstemmedVariants = new HashMap<>(ret.documents.size()*10); + + int qualifiedDocCount = 0; + for (var doc : ret.documents) { + if (doc.words == null) + continue; + + qualifiedDocCount++; + + for (var block : sourceBlocks) { + for (var word : doc.words.get(block).words) { + String wordStemmed = wordToStemmedMemoized.computeIfAbsent(word, ps::stemWord); + + // Count by negative values to sort by Map.Entry.comparingByValue() in reverse + topStemmedKeywordCount.merge(wordStemmed, -1, Integer::sum); + + stemmedToNonstemmedVariants.computeIfAbsent(wordStemmed, w -> new HashSet<>()).add(word); + } + } + } + + int totalValue = 0; + for (int value : topStemmedKeywordCount.values()) { + totalValue += value; + } + + if (totalValue > -REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION) + return Collections.emptyList(); + + List topWords = new ArrayList<>(MAX_SITE_KEYWORDS_TO_EXTRACT); + + double qualifyingValue = -qualifiedDocCount * QUALIFYING_PROPORTION_FOR_KEYWORD; + + topStemmedKeywordCount.entrySet().stream() + .filter(e -> e.getValue() < qualifyingValue) + .sorted(Map.Entry.comparingByValue()) + .limit(MAX_SITE_KEYWORDS_TO_EXTRACT) + .forEach(e -> topWords.addAll(stemmedToNonstemmedVariants.get(e.getKey()))); + + + return topWords; + + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java index d6d2d8f4..5b875442 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java @@ -14,6 +14,8 @@ public 
enum HtmlFeature { ADVERTISEMENT("special:ads"), CATEGORY_CRAFTS("category:crafts"), + + UNKNOWN("special:uncategorized") ; private final String keyword; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java index acc9708c..6cd2d53a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java @@ -13,10 +13,14 @@ import java.io.InputStreamReader; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.TimeUnit; public class CrawledDomainReader { private final Gson gson = new GsonBuilder().create(); + private final ForkJoinPool pool = new ForkJoinPool(4); + public CrawledDomainReader() { } @@ -43,7 +47,12 @@ public class CrawledDomainReader { if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) { domain = gson.fromJson(nextLine, CrawledDomain.class); } else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) { - docs.add(gson.fromJson(nextLine, CrawledDocument.class)); + pool.execute(() -> { + var doc = gson.fromJson(nextLine, CrawledDocument.class); + synchronized (docs) { + docs.add(doc); + } + }); } } else if (line.charAt(0) == '{') { domain = gson.fromJson(line, CrawledDomain.class); @@ -52,6 +61,8 @@ public class CrawledDomainReader { } } + pool.awaitQuiescence(10, TimeUnit.SECONDS); + if (domain == null) { return null; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlerDocumentStatus.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlerDocumentStatus.java index 38f17abd..f65da4da 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlerDocumentStatus.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlerDocumentStatus.java @@ -6,5 +6,6 @@ public enum CrawlerDocumentStatus { BAD_CHARSET, REDIRECT, ROBOTS_TXT, - ERROR + ERROR, + Timeout } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java index 00736de7..4c463307 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java @@ -1,5 +1,6 @@ package nu.marginalia.wmsa.edge.data.dao; +import com.google.common.base.Strings; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.google.common.util.concurrent.UncheckedExecutionException; @@ -113,9 +114,12 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { Double.MAX_VALUE, // termScore 0 // queryLength ); - if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) { - result.add(val); + if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF + && Strings.isNullOrEmpty(val.description) + && val.url.path.length() > 1) { + continue; } + result.add(val); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index 939b625b..446e9ce3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -6,7 +6,6 @@ import com.google.inject.Inject; import com.google.inject.name.Named; import com.google.protobuf.InvalidProtocolBufferException; import gnu.trove.map.TLongIntMap; -import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TLongIntHashMap; import gnu.trove.set.hash.TIntHashSet; import io.prometheus.client.Histogram; @@ -227,12 +226,7 @@ public 
class EdgeIndexService extends Service { long start = System.currentTimeMillis(); try { - if (specsSet.isStagger()) { - return new EdgeSearchResultSet(searchStaggered(specsSet)); - } - else { - return new EdgeSearchResultSet(searchStraight(specsSet)); - } + return new EdgeSearchResultSet(searchStraight(specsSet)); } catch (HaltException ex) { logger.warn("Halt", ex); @@ -249,59 +243,9 @@ public class EdgeIndexService extends Service { } } - private Map> searchStaggered(EdgeSearchSpecification specsSet) { - int count = 0; - - final Map> results = new HashMap<>(); - final TIntHashSet seenResults = new TIntHashSet(); - - final DomainResultCountFilter[] domainCountFilter = new DomainResultCountFilter[] { - new DomainResultCountFilter(specsSet.limitByDomain), - new DomainResultCountFilter(specsSet.limitByDomain) - }; - - final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS); - final TIntIntHashMap limitsPerBucketRemaining = new TIntIntHashMap(6, 0.7f, 0, specsSet.limitByBucket); - - for (int i = 0; i < specsSet.buckets.size(); i+=2) { - for (var sq : specsSet.subqueries) { - for (int j = 0; j < 2 && i + j < specsSet.buckets.size(); j++) { - Optional searchTerms = getSearchTerms(sq); - - if (searchTerms.isEmpty()) - continue; - - var result = performSearch(searchTerms.get(), - budget, - seenResults, - domainCountFilter[j], - sq, - List.of(specsSet.buckets.get(i+j)), - specsSet, - Math.min(limitsPerBucketRemaining.get(i+j), specsSet.limitTotal - count) - ); - - if (logger.isDebugEnabled()) { - logger.debug("{} -> {} {} {}", sq.block, specsSet.buckets.get(i+j), sq.searchTermsInclude, result.results.values().stream().mapToInt(List::size).sum()); - } - - int sz = result.size(); - count += sz; - limitsPerBucketRemaining.adjustOrPutValue(i+j, -sz, specsSet.limitByBucket-sz); - - if (sz > 0) { - results.computeIfAbsent(sq.block, s -> new ArrayList<>()).add(result); - } - } - } - } - - return results; - } - @NotNull - private Map> 
searchStraight(EdgeSearchSpecification specsSet) { - Map> results = new HashMap<>(); + private Map> searchStraight(EdgeSearchSpecification specsSet) { + Map> results = new HashMap<>(); int count = 0; TIntHashSet seenResults = new TIntHashSet(); @@ -314,25 +258,38 @@ public class EdgeIndexService extends Service { if (searchTerms.isEmpty()) continue; - var result = performSearch(searchTerms.get(), + var resultForSq = performSearch(searchTerms.get(), budget, seenResults, domainCountFilter, sq, specsSet.buckets, specsSet, specsSet.limitTotal - count); if (logger.isDebugEnabled()) { - logger.debug("{} -> {} {}", sq.block, sq.searchTermsInclude, result.size()); + logger.debug("{} -> {} {}", sq.block, sq.searchTermsInclude, resultForSq.size()); } - count += result.size(); - if (result.size() > 0) { - results.computeIfAbsent(sq.block, s -> new ArrayList<>()).add(result); + count += resultForSq.size(); + if (resultForSq.size() > 0) { + results.computeIfAbsent(sq.block, s -> new ArrayList<>()).addAll(resultForSq); } } + + List> distinctSearchTerms = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); + + results.forEach((index, blockResults) -> { + for (var result : blockResults) { + for (int i = 0; i < distinctSearchTerms.size(); i++) { + for (var term : distinctSearchTerms.get(i)) { + result.scores.add(getSearchTermScore(i, result.bucketId, term, result.getCombinedId())); + } + } + } + }); + return results; } - private EdgeSearchResults performSearch(EdgeIndexSearchTerms searchTerms, + private List performSearch(EdgeIndexSearchTerms searchTerms, IndexSearchBudget budget, TIntHashSet seenResults, DomainResultCountFilter domainCountFilter, @@ -342,14 +299,14 @@ public class EdgeIndexService extends Service { int limit) { if (limit <= 0) { - return new EdgeSearchResults(); + return new ArrayList<>(); } - final Map> results = new HashMap<>(); + final List results = new ArrayList<>(); final DomainResultCountFilter localFilter = new 
DomainResultCountFilter(specs.limitByDomain); for (int i : specBuckets) { - int foundResultsCount = results.values().stream().mapToInt(List::size).sum(); + int foundResultsCount = results.size(); if (foundResultsCount >= specs.limitTotal || foundResultsCount >= limit) break; @@ -362,38 +319,33 @@ public class EdgeIndexService extends Service { .limit(specs.limitTotal * 3L) .distinct() .limit(Math.min(specs.limitByBucket - - results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount)) + - results.size(), limit - foundResultsCount)) .forEach(resultsForBucket::add); for (var result : resultsForBucket) { seenResults.add(result.url.id()); } - for (var result : resultsForBucket) { - for (var searchTerm : sq.searchTermsInclude) { - result.scores.add(getSearchTermScore(i, searchTerm, result.getCombinedId())); - } - } domainCountFilter.addAll(i, resultsForBucket); - if (!resultsForBucket.isEmpty()) { - results.put(i, resultsForBucket); - } + results.addAll(resultsForBucket); } - return new EdgeSearchResults(results); + return results; } - private EdgeSearchResultKeywordScore getSearchTermScore(int bucketId, String term, long urlId) { + private EdgeSearchResultKeywordScore getSearchTermScore(int set, int bucketId, String term, long urlId) { final int termId = indexes.getDictionaryReader().get(term); var bucket = indexes.getBucket(bucketId); - return new EdgeSearchResultKeywordScore(term, + return new EdgeSearchResultKeywordScore(set, term, bucket.getTermScore(termId, urlId), bucket.isTermInBucket(IndexBlock.Title, termId, urlId), - bucket.isTermInBucket(IndexBlock.Link, termId, urlId) + bucket.isTermInBucket(IndexBlock.Link, termId, urlId), + bucket.isTermInBucket(IndexBlock.Site, termId, urlId), + bucket.isTermInBucket(IndexBlock.Subjects, termId, urlId) ); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java index 
1d0915aa..b746516a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java @@ -2,20 +2,20 @@ package nu.marginalia.wmsa.edge.index.model; public enum IndexBlock { TitleKeywords(0, 0), - Title(1, 1), + Title(1, 0), Link(2, 1.15), - Subjects(3, 3.0), + Subjects(3, 1.0), NamesWords(4, 3.0), Artifacts(5, 10), Meta(6, 7), - Tfidf_Top(7, 0.5), - Tfidf_Middle(8, 1.25), - Tfidf_Lower(9, 1.5), + Tfidf_Top(7, 1.5), + Tfidf_Middle(8, 2), + Tfidf_Lower(9, 3.5), - Words_1(10, 3.0), + Words_1(10, 2.0), Words_2(11, 3.5), Words_4(12, 4.0), Words_8(13, 4.5), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java index 56df6ddf..195828e4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java @@ -47,7 +47,7 @@ public class SearchIndexReader implements AutoCloseable { var linkIndex = indices.get(IndexBlock.Link); var titleIndex = indices.get(IndexBlock.Title); var namesIndex = indices.get(IndexBlock.NamesWords); - var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords); + var siteIndex = indices.get(IndexBlock.Site); var metaIndex = indices.get(IndexBlock.Meta); var topicIndex = indices.get(IndexBlock.Subjects); @@ -61,14 +61,17 @@ public class SearchIndexReader implements AutoCloseable { queryBuilders = new EnumMap<>(IndexBlock.class); underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class); + queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, linkIndex), words1)); queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words1), words1)); queryBuilders.put(IndexBlock.Words_2, new 
IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words2), words1)); queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words4), words1)); queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words8), words1)); queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1)); - underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, linkIndex, topIndex, namesIndex, topicIndex, metaIndex), words1)); - underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(linkIndex, namesIndex, topIndex, midIndex, lowIndex, topicIndex, metaIndex), words1)); + underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, linkIndex, topIndex, siteIndex, namesIndex, topicIndex, metaIndex), words1)); + underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, namesIndex, siteIndex, midIndex, topicIndex, metaIndex), words1)); + underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Middle, new IndexQueryBuilder(listOfNonNulls(midIndex, linkIndex, namesIndex, topIndex, siteIndex, midIndex, lowIndex, topicIndex, metaIndex), words1)); + underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Lower, new IndexQueryBuilder(listOfNonNulls(midIndex, linkIndex, namesIndex, topIndex, siteIndex, midIndex, lowIndex, topicIndex, metaIndex, artifacts), words1)); } @SafeVarargs diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java index 78e132b3..f10bebd5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java @@ -46,7 +46,7 @@ public class IndexQueryBuilder { return new QueryForIndices(budget, LongStream::empty); } else if (relevantIndices.length == 1 || relevantIndices[0] != 0) { - return build(budget, filter, wordId); + return new QueryForIndices(budget, LongStream::empty); } var fstRange = requiredIndices.get(relevantIndices[0]).rangeForWord(wordId); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java index 28b4255b..037c12f3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java @@ -16,6 +16,7 @@ import java.io.IOException; import java.nio.file.Path; import java.util.Iterator; import java.util.function.Consumer; +import java.util.function.Predicate; import java.util.stream.Stream; @AllArgsConstructor @NoArgsConstructor @ToString @@ -86,7 +87,21 @@ public class EdgeCrawlPlan { throw new RuntimeException(ex); } } + public void forEachCrawledDomain(Predicate idReadPredicate, Consumer consumer) { + final CrawledDomainReader reader = new CrawledDomainReader(); + try (Stream entryStream = WorkLog.streamLog(crawl.getLogFile())) { + entryStream + .filter(entry -> idReadPredicate.test(entry.id())) + .map(CrawlLogEntry::path) + .map(this::getCrawledFilePath) + .map(reader::readRuntimeExcept) + .forEach(consumer); + } + catch (IOException ex) { + throw new RuntimeException(ex); + } + } @MustBeClosed public DomainsIterable domainsIterable() throws IOException { return new DomainsIterable(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java index d8c66fc2..8acaf0d0 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java @@ -13,18 +13,18 @@ import java.util.List; @AllArgsConstructor @ToString @Getter @EqualsAndHashCode public class EdgeSearchResultItem { - public final int blockId; + public final int bucketId; public final int queryLength; public final EdgeId domain; // this isn't the external domain ID, but a ranking public final EdgeId url; public final List scores; - public EdgeSearchResultItem(int blockId, int queryLength, long val) { + public EdgeSearchResultItem(int bucketId, int queryLength, long val) { int urlId = (int) (val & 0xFFFF_FFFFL); int domainId = (int) (val >>> 32); this.queryLength = queryLength; - this.blockId = blockId; + this.bucketId = bucketId; url = new EdgeId<>(urlId); domain = new EdgeId<>(domainId); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java index e20dfbcd..f40b53b5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java @@ -1,14 +1,6 @@ package nu.marginalia.wmsa.edge.model.search; -import lombok.AllArgsConstructor; -import lombok.EqualsAndHashCode; -import lombok.ToString; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -@AllArgsConstructor @ToString @EqualsAndHashCode -public class EdgeSearchResultKeywordScore { - public final String keyword; - public final IndexBlock index; - public boolean title; - public boolean link; +public record EdgeSearchResultKeywordScore(int set, String keyword, IndexBlock index, boolean title, boolean link, boolean site, boolean subject) { } diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultSet.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultSet.java index 59bae816..6356bd8b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultSet.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultSet.java @@ -10,7 +10,7 @@ import java.util.Map; @AllArgsConstructor @Getter @ToString public class EdgeSearchResultSet { - public Map> resultsList; + public Map> resultsList; public int size() { return resultsList.values().stream().mapToInt(List::size).sum(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResults.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResults.java index e23496c4..00b51df4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResults.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResults.java @@ -4,29 +4,23 @@ import lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; -import java.util.HashMap; +import java.util.ArrayList; import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; import java.util.stream.Stream; @AllArgsConstructor @Getter @ToString public class EdgeSearchResults { - public final Map> results; + public final List results; public EdgeSearchResults() { - results = new HashMap<>(); + results = new ArrayList<>(); } public int size() { - return results.values().stream().mapToInt(List::size).sum(); + return results.size(); } public Stream stream() { - return results.values().stream().flatMap(List::stream); - } - - public List getAllItems() { - return stream().collect(Collectors.toList()); + return results.stream(); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java index 0a7cd0c9..942d1f19 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java @@ -94,7 +94,7 @@ public class EdgeUrlDetails { } public double getRanking() { - double lengthAdjustment = Math.max(1, words / (words + 1000.)); + double lengthAdjustment = Math.max(1, words / (words + 10000.)); return getFeatureScore()*Math.sqrt(1+rankingId)/Math.max(1E-10, lengthAdjustment *(0.7+0.3*Math.exp(urlQualityAdjustment.getScore()))); } @@ -132,6 +132,7 @@ public class EdgeUrlDetails { public boolean isCookies() { return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES); } + public boolean isUnknown() { return HtmlFeature.hasFeature(features, HtmlFeature.UNKNOWN); } public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.ADVERTISEMENT); } public boolean isSpecialDomain() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java index 730f8737..339b9f21 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java @@ -39,6 +39,7 @@ import javax.annotation.Nullable; import java.util.*; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; +import java.util.regex.Pattern; import java.util.stream.Collectors; @Singleton @@ -236,6 +237,8 @@ public class EdgeSearchOperator { } + private final Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}"); + private EdgePageScoreAdjustment adjustScoreBasedOnQuery(EdgeUrlDetails p, EdgeSearchSpecification specs) { String titleLC = p.title == null ? "" : p.title.toLowerCase(); String descLC = p.description == null ? 
"" : p.description.toLowerCase(); @@ -248,11 +251,16 @@ public class EdgeSearchOperator { .toArray(String[]::new); int termCount = searchTermsLC.length; - String[] titleParts = titleLC.split("[:!|./]|(\\s-|-\\s)|\\s{2,}"); double titleHitsAdj = 0.; + final String[] titleParts = titleSplitPattern.split(titleLC); for (String titlePart : titleParts) { - titleHitsAdj += Arrays.stream(searchTermsLC).filter(titlePart::contains).mapToInt(String::length).sum() - / (double) Math.max(1, titlePart.trim().length()); + double hits = 0; + for (String term : searchTermsLC) { + if (titlePart.contains(term)) { + hits += term.length(); + } + } + titleHitsAdj += hits / Math.max(1, titlePart.length()); } double titleFullHit = 0.; @@ -299,10 +307,8 @@ public class EdgeSearchOperator { logger.debug("{}", resultSet); for (IndexBlock block : indexBlockSearchOrder) { - for (var results : resultSet.resultsList.getOrDefault(block, Collections.emptyList())) { - var items = results.getAllItems(); - queryResults.append(100, resultDecorator.decorateSearchResults(items, block, deduplicator)); - } + queryResults.append(100, resultDecorator.decorateSearchResults(resultSet.resultsList.getOrDefault(block, Collections.emptyList()), + block, deduplicator)); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java index fb607e16..df19ef16 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java @@ -10,31 +10,31 @@ import java.util.stream.Collectors; public enum EdgeSearchProfile { DEFAULT("default", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Link, + List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, 
IndexBlock.Words_8, IndexBlock.Words_16Plus ), 0, 1), MODERN("modern", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords, + List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords, IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus ), 2), CORPO("corpo", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords, + List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords, IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus), 4, 5, 7), YOLO("yolo", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords, + List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords, IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus), 0, 2, 1, 3, 4, 6), CORPO_CLEAN("corpo-clean", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords), + List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords), 4, 5), ACADEMIA("academia", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords), + List.of( IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords), 3), FOOD("food", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, 
IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords), + List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords), 2, 0), ; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java index ce1aea9e..aeba3dc5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java @@ -1,7 +1,7 @@ package nu.marginalia.wmsa.edge.search.query; import com.google.inject.Inject; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; +import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -13,12 +13,12 @@ import java.util.stream.Collectors; public class EnglishDictionary { private final Set englishWords = new HashSet<>(); - private final TermFrequencyDict dict; + private final NGramBloomFilter bloomFilter; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public EnglishDictionary(TermFrequencyDict dict) { - this.dict = dict; + public EnglishDictionary(NGramBloomFilter bloomFilter) { + this.bloomFilter = bloomFilter; try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-words"), "Could not load word frequency table"); var br = new BufferedReader(new InputStreamReader(resource)) @@ -44,10 +44,9 @@ public class EnglishDictionary { public Collection getWordVariants(String s) { var variants = findWordVariants(s); - long freqBaseline = dict.getTermFreq(s); var ret = variants.stream() - .filter(var -> freqBaseline*10 > dict.getTermFreq(var) && freqBaseline/10 < dict.getTermFreq(var) + 
.filter(bloomFilter::isKnownNGram ).collect(Collectors.toList()); if (s.equals("recipe") || s.equals("recipes")) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java index a602f620..c50839de 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java @@ -130,7 +130,7 @@ public class QueryFactory { } } - EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.TitleKeywords); + EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.Title); params.profile().addTacitTerms(subquery); params.jsSetting().addTacitTerms(subquery); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java index 12d358bf..0608fbae 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java @@ -101,13 +101,27 @@ public class SearchResultDecorator { if (!missedIds.isEmpty()) { logger.warn("Could not look up documents: {}", missedIds.toArray()); } - retList.sort(Comparator.comparing(EdgeUrlDetails::getTermScore)); + retList.sort(Comparator.comparing(EdgeUrlDetails::getTermScore) + .thenComparing(url -> url.url.path.length())); return retList; } private double calculateTermScore(IndexBlock block, EdgeSearchResultItem resultItem, EdgeUrlDetails details) { - return valuator.evaluateTerms(resultItem.scores, block, details.words) / Math.sqrt(1 + resultItem.queryLength) + int titleLength = details.title.length(); + + double value = 
valuator.evaluateTerms(resultItem.scores, block, details.words,titleLength) / Math.sqrt(1 + resultItem.queryLength) + ((details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0); + + System.out.println("---"); + System.out.println(details.getUrl()); + System.out.println(details.getTitle()); + System.out.println(details.words); + for (var score : resultItem.scores) { + System.out.println(block + ":" + score); + } + System.out.println(value); + + return value; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java index a1c3bc58..f6a8581f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java @@ -16,8 +16,8 @@ public class SearchResultValuator { private static final Pattern separator = Pattern.compile("_"); - private static final int MIN_LENGTH = 500; - private static final int AVG_LENGTH = 1400; + private static final int MIN_LENGTH = 2000; + private static final int AVG_LENGTH = 5000; @Inject public SearchResultValuator(TermFrequencyDict dict) { @@ -26,58 +26,85 @@ public class SearchResultValuator { // This is basically a bargain bin BM25 - public double evaluateTerms(List rawScores, IndexBlock block, int length) { - EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> !w.keyword.contains(":")).toArray(EdgeSearchResultKeywordScore[]::new); + public double evaluateTerms(List rawScores, IndexBlock block, int length, int titleLength) { + int sets = 1 + rawScores.stream().mapToInt(EdgeSearchResultKeywordScore::set).max().orElse(0); - if (scores.length == 0) { - return IndexBlock.Words_1.sortOrder; - } + double bestScore = 1000; + double bestLtsFactor = 1.; - final double[] weights = getTermWeights(scores); - final double lengthPenalty = 
getLengthPenalty(length); + for (int set = 0; set <= sets; set++) { + int thisSet = set; + EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.set() == thisSet && !w.keyword().contains(":")).toArray(EdgeSearchResultKeywordScore[]::new); - double termSum = 0.; - double factorSum = 0.; - - for (int i = 0; i < scores.length; i++) { - - final double factor = 1. / (1.0 + weights[i]); - - factorSum += factor; - - double termValue = (scores[i].index.sortOrder + 0.5) * factor; - - if (!scores[i].link && !scores[i].title) { - termValue *= lengthPenalty; - } - else if (scores[i].link) { - termValue /= 4.75; + if (scores.length == 0) { + continue; } - termSum += termValue; + final double[] weights = getTermWeights(scores); + final double lengthPenalty = getLengthPenalty(length); + + double termSum = 0.; + double factorSum = 0.; + + double ltsFactor = 1.0; + + for (int i = 0; i < scores.length; i++) { + + final double factor = 1. / (1.0 + weights[i]); + + factorSum += factor; + + double termValue = (scores[i].index().sortOrder + 0.5) * factor; + + termValue /= lengthPenalty; + + if (scores[i].link()) { + ltsFactor *= Math.pow(0.5, 1. / scores.length); + } + if (scores[i].title()) { + if (titleLength <= 64) { + ltsFactor *= Math.pow(0.5, 1. / scores.length); + } + else if (titleLength < 96) { + ltsFactor *= Math.pow(0.75, 1. / scores.length); + } + else { + ltsFactor *= Math.pow(0.9, 1. / scores.length); + } + } + if (scores[i].subject()) { + ltsFactor *= Math.pow(0.8, 1. 
/ scores.length); + } + + termSum += termValue; + } + + assert factorSum != 0; + + double value = termSum / factorSum; + + bestLtsFactor = Math.min(bestLtsFactor, ltsFactor); + bestScore = Math.min(bestScore, value); } - assert factorSum != 0 ; - - if (block == IndexBlock.Title || block == IndexBlock.TitleKeywords) { - return block.sortOrder + (termSum / factorSum) / 5; - } - - return termSum / factorSum; + return (0.7+0.3*block.sortOrder)*bestScore * bestLtsFactor; } private double getLengthPenalty(int length) { if (length < MIN_LENGTH) { length = MIN_LENGTH; } - return (0.7 + 0.3 * length / AVG_LENGTH); + if (length > AVG_LENGTH) { + length = AVG_LENGTH; + } + return (0.5 + 0.5 * length / AVG_LENGTH); } private double[] getTermWeights(EdgeSearchResultKeywordScore[] scores) { double[] weights = new double[scores.length]; for (int i = 0; i < scores.length; i++) { - String[] parts = separator.split(scores[i].keyword); + String[] parts = separator.split(scores[i].keyword()); double sumScore = 0.; int count = 0; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java index 58fff3c6..c22f5ddd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java @@ -3,21 +3,35 @@ package nu.marginalia.wmsa.edge.tools; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; +import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.converting.ConverterModule; import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor; +import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector; +import 
nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector; import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; +import org.jsoup.Jsoup; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; +import java.util.concurrent.ForkJoinPool; public class ConverterLogicTestTool { private final Logger logger = LoggerFactory.getLogger(getClass()); + DomPruner domPruner = new DomPruner(); + RecipeDetector recipeDetector = new RecipeDetector(); + WoodworkingDetector woodworkingDetector = new WoodworkingDetector(); + TextileCraftDetector textileCraftDetector = new TextileCraftDetector(); + + SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); + public static void main(String... args) throws IOException { if (args.length != 1) { @@ -38,19 +52,42 @@ public class ConverterLogicTestTool { EdgeCrawlPlan plan, DomainProcessor processor ) throws Exception { + var cp = new ForkJoinPool(16); plan.forEachCrawledDomain(domain -> { - var ret = processor.process(domain); - ret.documents.forEach(doc -> { - if (doc.words == null) - return; - var artifacts = doc.words.get(IndexBlock.Artifacts); - if (artifacts.size() > 0) { - System.out.println(doc.url + ": " + artifacts); - } - }); - }); + if (domain.doc == null) return; + + for (var doc : domain.doc) { + if (doc.documentBody == null) continue; + + Runnable task = () -> { + var parsed = Jsoup.parse(doc.documentBody); + + domPruner.prune(parsed, 0.5); + var dld = se.extractSentences(parsed); + + if (dld.totalNumWords() < 250) + return; + + if (textileCraftDetector.testP(dld) > 0.3) { + System.out.println("textilecraft\t" + doc.url); + } + if (woodworkingDetector.testP(dld) > 0.2) { + System.out.println("woodworking\t" + doc.url); + } + if 
(recipeDetector.testP(dld) > 0.5) { + System.out.println("recipe\t" + doc.url); + } + }; + + if (cp.getQueuedSubmissionCount() > 32) { + task.run(); + } else { + cp.execute(task); + } + } + }); } } diff --git a/marginalia_nu/src/main/resources/dictionary/en-stopwords b/marginalia_nu/src/main/resources/dictionary/en-stopwords index cdcd342d..d97db17c 100644 --- a/marginalia_nu/src/main/resources/dictionary/en-stopwords +++ b/marginalia_nu/src/main/resources/dictionary/en-stopwords @@ -151,15 +151,6 @@ i've it's it i'm -1 -2 -3 -4 -5 -6 -7 -8 -9 . .. ... diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java index d839bbb2..bc628b13 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java @@ -42,7 +42,7 @@ class SqlLoadDomainLinksTest { @Test public void loadDomainLinks() { var loader = new SqlLoadDomainLinks(dataSource); - loader.load(new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) }); + loader.load(loaderData, new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) }); } } \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java index 792a4221..0bd22764 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java @@ -113,38 +113,13 @@ class SentenceExtractorTest { var dict = new TermFrequencyDict(lm); DocumentKeywordExtractor documentKeywordExtractor = 
new DocumentKeywordExtractor(dict); - -// documentKeywordExtractorLegacy.setLegacy(true); - -// for (;;) { - long st = System.currentTimeMillis(); - for (var file : Objects.requireNonNull(data.toFile().listFiles())) { - - - var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath()))); - - var newRes = documentKeywordExtractor.extractKeywords(newResult); - - -// var legacyRes = documentKeywordExtractorLegacy.extractKeywords(newResult); -// -// EdgePageWordSet difference = new EdgePageWordSet(); -// for (IndexBlock block : IndexBlock.values()) { - -// var newWords = new HashSet<>(newRes.get(block).words); -// var oldWords = new HashSet<>(legacyRes.get(block).words); -// newWords.removeAll(oldWords); - -// if (!newWords.isEmpty()) { -// difference.append(block, newWords); -// } -// } -// System.out.println(difference); - System.out.println(newRes); -// System.out.println("---"); - } - System.out.println(System.currentTimeMillis() - st); -// } + long st = System.currentTimeMillis(); + for (var file : Objects.requireNonNull(data.toFile().listFiles())) { + var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath()))); + var newRes = documentKeywordExtractor.extractKeywords(newResult); + System.out.println(newRes); + } + System.out.println(System.currentTimeMillis() - st); } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java deleted file mode 100644 index da9206bf..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java +++ /dev/null @@ -1,156 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service; - -import com.zaxxer.hikari.HikariDataSource; -import lombok.SneakyThrows; -import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.configuration.server.Initialization; 
-import nu.marginalia.wmsa.edge.index.EdgeIndexService; -import nu.marginalia.wmsa.edge.index.IndexServicesFactory; -import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; -import nu.marginalia.wmsa.edge.model.EdgeId; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.parallel.Execution; -import org.junit.jupiter.api.parallel.ExecutionMode; -import org.junit.jupiter.api.parallel.ResourceAccessMode; -import org.junit.jupiter.api.parallel.ResourceLock; -import spark.Spark; - -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; - -import static nu.marginalia.util.TestUtil.getConnection; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE) -@Execution(ExecutionMode.SAME_THREAD) -@Tag("db") -public class EdgeIndexClientTest { - private static HikariDataSource dataSource; - private static EdgeIndexService service; - private static EdgeIndexClient client; - private static Path tempDir; - private static SearchIndexes indexes; - - @SneakyThrows - public static HikariDataSource provideConnection() { - return getConnection(); - } - - static final int testPort = TestUtil.getPort(); - - @SneakyThrows - @BeforeAll - public static void setUpClass() { - Spark.port(testPort); - 
System.setProperty("service-name", "edge-index"); - - dataSource = provideConnection(); - dataSource.setKeepaliveTime(100); - dataSource.setIdleTimeout(100); - client = new EdgeIndexClient(); - client.setServiceRoute("127.0.0.1", testPort); - - tempDir = Files.createTempDirectory("EdgeIndexClientTest"); - - var servicesFactory = new IndexServicesFactory(tempDir,tempDir,tempDir,tempDir, - "writer-index", - "writer-dictionary", - "index-words-read", - "index-urls-read", - "index-words-write", - "index-urls-write", - 1L<<24, - id->false, - new SearchIndexPartitioner(null) - ); - - var init = new Initialization(); - indexes = new SearchIndexes(servicesFactory, new SearchIndexPartitioner(null)); - service = new EdgeIndexService("127.0.0.1", - testPort, - init, null, - indexes, - servicesFactory); - - Spark.awaitInitialization(); - init.setReady(); - } - - @Test - public void testMultiBucketHit() { - putWords(1, 1, -2, "fancy", "anagram", "dilbert", "whoah", "engram"); - putWords(2, 2, -5, "quibble", "angry", "whoah", "fancy"); - putWords(3, 3, -0.01, "strong", "manly", "muscles"); - indexes.repartition(); - indexes.preconvert(); - indexes.reindexAll(); - - var results = client.query(Context.internal(), EdgeSearchSpecification.justIncludes("fancy")).resultsList.get(IndexBlock.Title).get(0).results; - System.out.println(results); - List> flatResults = results.values().stream().flatMap(List::stream).map(rs -> rs.url).collect(Collectors.toList()); - - assertEquals(2, flatResults.size()); - assertTrue(flatResults.contains(new EdgeId(1))); - assertTrue(flatResults.contains(new EdgeId(2))); - } - - @Test - public void testHighHit() { - putWords(2, 5, -100, "trapphus"); - indexes.repartition(); - indexes.preconvert(); - indexes.reindexAll(); - var rsp = client.query(Context.internal(), EdgeSearchSpecification.justIncludes("trapphus")); - System.out.println(rsp); - assertEquals(5, rsp.resultsList.get(IndexBlock.Title).get(0).results.get(0).get(0).url.id()); - } - - - @Test - 
public void testSearchDomain() { - putWords(8, 1, -2, "domain"); - putWords(8, 2, -5, "domain"); - putWords(10, 3, -0.01, "domain"); - putWords(11, 3, -0.01, "domain"); - putWords(12, 3, -0.01, "domain"); - indexes.repartition(); - indexes.preconvert(); - indexes.reindexAll(); - - var results = client.query(Context.internal(), EdgeSearchSpecification.justIncludes("fancy")).resultsList.get(IndexBlock.Title).get(0).results; - System.out.println(results); - List> flatResults = results.values().stream().flatMap(List::stream).map(rs -> rs.url).collect(Collectors.toList()); - - assertEquals(2, flatResults.size()); - assertTrue(flatResults.contains(new EdgeId(1))); - assertTrue(flatResults.contains(new EdgeId(2))); - } - - void putWords(int didx, int idx, double quality, String... words) { - EdgePageWords epw = new EdgePageWords(IndexBlock.Title); - epw.addAll(Arrays.asList(words)); - client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx), - new EdgePageWordSet(epw), 0).blockingSubscribe(); - } - - @AfterAll - public static void tearDownClass() { - nu.marginalia.util.test.TestUtil.clearTempDir(tempDir); - } - -} diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java index 4e1bc2b6..28dab56f 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java @@ -24,7 +24,7 @@ class BodyQueryParserTest { public static void init() throws IOException { dict = new TermFrequencyDict(lm); nGramBloomFilter = new NGramBloomFilter(lm); - englishDictionary = new EnglishDictionary(dict); + englishDictionary = new EnglishDictionary(nGramBloomFilter); } @BeforeEach diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionaryTest.java 
b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionaryTest.java deleted file mode 100644 index f6a5b66f..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionaryTest.java +++ /dev/null @@ -1,17 +0,0 @@ -package nu.marginalia.wmsa.edge.search.query; - -import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; -import org.junit.jupiter.api.Test; - -class EnglishDictionaryTest { - - @Test - void getWordVariants() { - LanguageModels lm = TestLanguageModels.getLanguageModels(); - - var dict = new TermFrequencyDict(lm); - new EnglishDictionary(dict).getWordVariants("dos").forEach(System.out::println); - } -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java index 5d2a2f83..2352af80 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java @@ -21,7 +21,7 @@ class QueryParserTest { public void setUp() throws IOException { dict = new TermFrequencyDict(lm); nGramBloomFilter = new NGramBloomFilter(lm); - englishDictionary = new EnglishDictionary(dict); + englishDictionary = new EnglishDictionary(nGramBloomFilter); parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary)); } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java index 597341bc..f99589c5 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java +++ 
b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java @@ -23,8 +23,8 @@ class QueryVariantsTest { var dict = new TermFrequencyDict(lm); var ngrams = new NGramBloomFilter(lm); - variants = new QueryVariants(lm, dict, ngrams, new EnglishDictionary(dict)); - parser = new QueryParser(new EnglishDictionary(dict), variants); + variants = new QueryVariants(lm, dict, ngrams, new EnglishDictionary(ngrams)); + parser = new QueryParser(new EnglishDictionary(ngrams), variants); } @Test From 3fd48e0e53f0892aff0197e5046e9eacb2c076b8 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Fri, 2 Sep 2022 10:41:02 +0200 Subject: [PATCH 09/19] Cleaning the code a bit, fix URL loading bug with multiple fragments in URL --- .../converting/processor/DomainProcessor.java | 17 ----------------- .../marginalia/wmsa/edge/model/EdgeUrl.java | 19 +++++++++++++++---- .../wmsa/edge/model/EdgeUrlTest.java | 1 + 3 files changed, 16 insertions(+), 21 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java index c12f992a..951dc274 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java @@ -3,7 +3,6 @@ package nu.marginalia.wmsa.edge.converting.processor; import com.google.common.base.Strings; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; @@ -110,22 +109,6 @@ public class DomainProcessor { } - private double getAverageQuality(List documents) { - int n = 0; - double q = 
0.; - for (var doc : documents) { - if (doc.quality().isPresent()) { - n++; - q += doc.quality().getAsDouble(); - } - } - - if (n > 0) { - return q / n; - } - return -5.; - } - private EdgeDomainIndexingState getState(String crawlerStatus) { return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) { case OK -> EdgeDomainIndexingState.ACTIVE; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java index 2ba9234c..957769bf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java @@ -41,24 +41,35 @@ public class EdgeUrl implements WideHashable { private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]"); + /* Java's URI parser is a bit too strict in throwing exceptions when there's an error. + + Here on the Internet, standards are like the picture on the box of the frozen pizza, + and what you get is more like what's on the inside, we try to patch things instead, + just give it a best-effort attempt att cleaning out broken or unnecessary constructions + like bad or missing URLEncoding + */ public static String urlencodeFixer(String url) throws URISyntaxException { var s = new StringBuilder(); String goodChars = "&.?:/-;+$#"; String hexChars = "0123456789abcdefABCDEF"; int pathIdx = findPathIdx(url); - if (pathIdx < 0) { - return url; + if (pathIdx < 0) { // url looks like http://marginalia.nu + return url + "/"; } s.append(url, 0, pathIdx); - for (int i = pathIdx; i < url.length(); i++) { + // We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason + int end = url.indexOf("#"); + if (end < 0) end = url.length(); + + for (int i = pathIdx; i < end; i++) { int c = url.charAt(i); if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <='Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) { s.appendCodePoint(c); 
} - else if (c == '%' && i+2= 0 && hexChars.indexOf(cnn) >= 0) { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java index 09498160..61444c69 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java @@ -27,6 +27,7 @@ class EdgeUrlTest { } @Test void urlencodeFixer() throws URISyntaxException { + System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc")); System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign")); System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign")); System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\"")); From ccf79f47b0b23380fcbc4fd675ed0fbbbefcf7b6 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Fri, 2 Sep 2022 14:51:11 +0200 Subject: [PATCH 10/19] Preparation for conversion --- .../assistant/dict/TermFrequencyDict.java | 5 +- .../processor/DocumentProcessor.java | 19 +++---- .../{DomPruner.java => DomPruningFilter.java} | 15 ++---- .../processor/logic/LinkParser.java | 52 ++++++++++++------- .../wmsa/edge/model/EdgeDomain.java | 12 +++-- .../wmsa/edge/search/EdgeSearchProfile.java | 47 +++++++---------- .../edge/tools/ConverterLogicTestTool.java | 5 +- .../templates/edge/parts/search-form.hdb | 8 ++- .../templates/edge/search-result-metadata.hdb | 2 +- ...nerTest.java => DomPruningFilterTest.java} | 2 +- 10 files changed, 85 insertions(+), 82 deletions(-) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/{DomPruner.java => DomPruningFilter.java} (89%) rename marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/{DomPrunerTest.java => DomPruningFilterTest.java} (86%) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java index d219b30d..2a29e2a4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java @@ -7,7 +7,7 @@ import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner; +import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter; import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -87,7 +87,6 @@ public class TermFrequencyDict { var plan = new CrawlPlanLoader().load(Path.of(args[0])); ThreadLocal se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels())); - DomPruner pruner = new DomPruner(); LanguageFilter lf = new LanguageFilter(); TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1); @@ -108,7 +107,7 @@ public class TermFrequencyDict { docCount.incrementAndGet(); Document parsed = Jsoup.parse(doc.documentBody); - pruner.prune(parsed, 0.5); + parsed.body().filter(new DomPruningFilter(0.5)); DocumentLanguageData dld = se.get().extractSentences(parsed); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index 60d1071d..5037c791 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -171,16 +171,15 @@ public class DocumentProcessor { throw new 
DisqualifiedException(DisqualificationReason.FORBIDDEN); } - DomPruner domPruner = new DomPruner(); Document prunedDoc = doc.clone(); - domPruner.prune(prunedDoc, 0.5); + prunedDoc.body().filter(new DomPruningFilter(0.5)); + var dld = sentenceExtractor.extractSentences(prunedDoc); checkDocumentLanguage(dld); var ret = new ProcessedDocumentDetails(); - ret.length = getLength(doc); ret.standard = getHtmlStandard(doc); ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url); @@ -246,12 +245,11 @@ public class DocumentProcessor { if (linkParser.shouldIndexLink(atag)) { linkOpt.ifPresent(lp::accept); } - else if (linkOpt.isPresent()) { - if (linkParser.hasBinarySuffix(linkOpt.get().toString())) { - linkOpt.ifPresent(lp::acceptNonIndexable); - } + else { + linkOpt + .filter(url -> linkParser.hasBinarySuffix(url.path.toLowerCase())) + .ifPresent(lp::acceptNonIndexable); } - } for (var frame : doc.getElementsByTag("frame")) { linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept); @@ -271,21 +269,20 @@ public class DocumentProcessor { linkTerms.add("links:"+fd.toString().toLowerCase()); linkTerms.add("links:"+fd.getDomain().toLowerCase()); } - words.append(IndexBlock.Meta, linkTerms); Set fileKeywords = new HashSet<>(100); for (var link : lp.getNonIndexableUrls()) { - if (!Objects.equals(domain, link.domain)) { + if (!domain.hasSameTopDomain(link.domain)) { continue; } synthesizeFilenameKeyword(fileKeywords, link); } - words.append(IndexBlock.Artifacts, fileKeywords); + } private void synthesizeFilenameKeyword(Set fileKeywords, EdgeUrl link) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruningFilter.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java rename to 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruningFilter.java index beb23977..1e68125f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruningFilter.java @@ -1,6 +1,5 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; -import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; @@ -9,22 +8,14 @@ import org.jsoup.select.NodeFilter; import java.util.HashMap; import java.util.Map; -public class DomPruner { +public class DomPruningFilter implements NodeFilter { - public void prune(Document document, double pruneThreshold) { - document.filter(new PruningFilter(pruneThreshold)); - } - -} - - -class PruningFilter implements NodeFilter { + private final double pruneThreshold; private final Map data = new HashMap<>(); private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0); - private double pruneThreshold; - public PruningFilter(double pruneThreshold) { + public DomPruningFilter(double pruneThreshold) { this.pruneThreshold = pruneThreshold; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index 98be5315..06313f1d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -19,10 +19,14 @@ import java.util.regex.Pattern; public class LinkParser { private final Logger logger = LoggerFactory.getLogger(getClass()); + private final List blockPrefixList = List.of( "mailto:", "javascript:", "tel:", "itpc:", "#", "file:"); - private final List blockSuffixList = List.of( + + private final List 
binarySuffixList = List.of( ".pdf", ".mp3", ".wmv", ".avi", ".zip", ".7z", + ".mpv", ".mp4", ".avi", ".mkv", ".tiff", ".dat", ".tar", + ".com", ".bat", ".sh", ".bin", ".exe", ".tar.gz", ".tar.bz2", ".xml", ".swf", ".wav", ".ogg", ".jpg", ".jpeg", ".png", ".gif", ".webp", ".webm", ".bmp", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx", @@ -33,7 +37,7 @@ public class LinkParser { return Optional.of(l) .filter(this::shouldIndexLink) .map(this::getUrl) - .map(link -> resolveUrl(relativeBaseUrl, link)) + .map(link -> resolveRelativeUrl(relativeBaseUrl, link)) .flatMap(this::createURI) .map(URI::normalize) .map(this::renormalize) @@ -44,7 +48,7 @@ public class LinkParser { public Optional parseLinkPermissive(EdgeUrl relativeBaseUrl, Element l) { return Optional.of(l) .map(this::getUrl) - .map(link -> resolveUrl(relativeBaseUrl, link)) + .map(link -> resolveRelativeUrl(relativeBaseUrl, link)) .flatMap(this::createURI) .map(URI::normalize) .map(this::renormalize) @@ -74,7 +78,7 @@ public class LinkParser { @Contract(pure=true) public Optional parseLink(EdgeUrl baseUrl, String str) { return Optional.of(str) - .map(link -> resolveUrl(baseUrl, link)) + .map(link -> resolveRelativeUrl(baseUrl, link)) .flatMap(this::createURI) .map(URI::normalize) .map(this::renormalize) @@ -85,7 +89,7 @@ public class LinkParser { public Optional parseFrame(EdgeUrl baseUrl, Element frame) { return Optional.of(frame) .map(l -> l.attr("src")) - .map(link -> resolveUrl(baseUrl, link)) + .map(link -> resolveRelativeUrl(baseUrl, link)) .flatMap(this::createURI) .map(URI::normalize) .map(this::renormalize) @@ -95,10 +99,10 @@ public class LinkParser { @SneakyThrows private URI renormalize(URI uri) { if (uri.getPath() == null) { - return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getFragment())); + return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getQuery(), uri.getFragment())); } if (uri.getPath().startsWith("/../")) { - return renormalize(new 
URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getFragment())); + return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getQuery(), uri.getFragment())); } return uri; } @@ -117,10 +121,10 @@ public class LinkParser { private static final Pattern paramSeparatorPattern = Pattern.compile("\\?"); @SneakyThrows - private String resolveUrl(EdgeUrl baseUrl, String s) { + private String resolveRelativeUrl(EdgeUrl baseUrl, String s) { // url looks like http://www.marginalia.nu/ - if (isAbsoluteDomain(s)) { + if (doesUrlStringHaveProtocol(s)) { return s; } @@ -154,8 +158,15 @@ public class LinkParser { return url.path.substring(0, lastSlash+1); } - private boolean isAbsoluteDomain(String s) { - return s.matches("^[a-zA-Z]+:.*$"); + private boolean doesUrlStringHaveProtocol(String s) { + int i = 0; + for (; i < s.length(); i++) { + if (!Character.isAlphabetic(s.charAt(i))) + break; + } + if (i == 0 || i == s.length()) + return false; + return ':' == s.charAt(i); } public boolean shouldIndexLink(Element link) { @@ -168,26 +179,29 @@ public class LinkParser { return !"noindex".equalsIgnoreCase(rel); } - public boolean hasBinarySuffix(String href) { - return blockSuffixList.stream().anyMatch(href::endsWith); - } - private boolean isUrlRelevant(String href) { if (null == href || "".equals(href)) { return false; } + if (href.length() > 128) { + return false; + } + href = href.toLowerCase(); + if (blockPrefixList.stream().anyMatch(href::startsWith)) { return false; } if (hasBinarySuffix(href)) { return false; } - if (href.length() > 128) { - return false; - } + return true; } + public boolean hasBinarySuffix(String str) { + return binarySuffixList.stream().anyMatch(str::endsWith); + } + @Nullable public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) { var baseTags = parsed.getElementsByTag("base"); @@ -196,7 +210,7 @@ public class LinkParser { for (var tag : baseTags) { String href = tag.attr("href"); if 
(!Strings.isNullOrEmpty(href)) { - return new EdgeUrl(resolveUrl(documentUrl, href)); + return new EdgeUrl(resolveRelativeUrl(documentUrl, href)); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java index 658184c0..58a78e58 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java @@ -9,7 +9,7 @@ import java.util.regex.Pattern; @AllArgsConstructor @Getter @Setter @Builder -public class EdgeDomain implements WideHashable { +public class EdgeDomain { private static final Predicate ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate(); private static final Predicate govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate(); @@ -23,6 +23,8 @@ public class EdgeDomain implements WideHashable { public EdgeDomain(String host) { Objects.requireNonNull(host, "domain name must not be null"); + host = host.toLowerCase(); + var dot = host.lastIndexOf('.'); if (dot < 0 || ipPatternTest.test(host)) { // IPV6 >.> @@ -99,9 +101,11 @@ public class EdgeDomain implements WideHashable { return ret.toString().toLowerCase(); } - @Override - public long wideHash() { - return ((long) Objects.hash(domain, subDomain) << 32) | toString().hashCode(); + + public boolean hasSameTopDomain(EdgeDomain other) { + if (other == null) return false; + + return domain.equalsIgnoreCase(other.domain); } public boolean equals(final Object o) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java index df19ef16..ca62c74d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java @@ -9,33 +9,16 @@ import java.util.List; import java.util.stream.Collectors; public enum EdgeSearchProfile { - DEFAULT("default", - List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, - IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus - ), - 0, 1), - MODERN("modern", - List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords, - IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus - ), - 2), - CORPO("corpo", - List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords, - IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus), - 4, 5, 7), - YOLO("yolo", - List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords, - IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus), - 0, 2, 1, 3, 4, 6), - CORPO_CLEAN("corpo-clean", - List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords), - 4, 5), - ACADEMIA("academia", - List.of( IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords), - 3), - FOOD("food", - List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords), - 2, 0), + + DEFAULT("default", SearchOrder.DEFAULT_ORDER, 0, 1), + MODERN("modern", SearchOrder.DEFAULT_ORDER, 2), + CORPO("corpo", SearchOrder.DEFAULT_ORDER, 4, 5, 7), + YOLO("yolo", SearchOrder.DEFAULT_ORDER, 0, 2, 1, 3, 4, 6), + CORPO_CLEAN("corpo-clean", 
SearchOrder.DEFAULT_ORDER, 4, 5), + ACADEMIA("academia", SearchOrder.DEFAULT_ORDER, 3), + + FOOD("food", SearchOrder.DEFAULT_ORDER, 2, 0), + CRAFTS("crafts", SearchOrder.DEFAULT_ORDER, 2, 0), ; @@ -55,12 +38,14 @@ public enum EdgeSearchProfile { if (null == param) { return YOLO; } + return switch (param) { case "modern" -> MODERN; case "default" -> DEFAULT; case "corpo" -> CORPO; case "academia" -> ACADEMIA; case "food" -> FOOD; + case "crafts" -> CRAFTS; default -> YOLO; }; } @@ -69,6 +54,14 @@ public enum EdgeSearchProfile { if (this == FOOD) { subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_FOOD.getKeyword()); } + if (this == CRAFTS) { + subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_CRAFTS.getKeyword()); + } } } + +class SearchOrder { + static List DEFAULT_ORDER = List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, + IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus); +} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java index c22f5ddd..afcb22ed 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java @@ -7,7 +7,7 @@ import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.converting.ConverterModule; import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor; -import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner; +import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter; import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector; import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector; 
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector; @@ -25,7 +25,6 @@ public class ConverterLogicTestTool { private final Logger logger = LoggerFactory.getLogger(getClass()); - DomPruner domPruner = new DomPruner(); RecipeDetector recipeDetector = new RecipeDetector(); WoodworkingDetector woodworkingDetector = new WoodworkingDetector(); TextileCraftDetector textileCraftDetector = new TextileCraftDetector(); @@ -64,7 +63,7 @@ public class ConverterLogicTestTool { Runnable task = () -> { var parsed = Jsoup.parse(doc.documentBody); - domPruner.prune(parsed, 0.5); + parsed.body().filter(new DomPruningFilter(0.5)); var dld = se.extractSentences(parsed); if (dld.totalNumWords() < 250) diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb b/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb index 839b7934..89e72d22 100644 --- a/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb @@ -7,12 +7,18 @@
+ +
+
+ + +
+ + + +
+
+

+ """ + + title + + """ +

+
+ """ + +message+ + """ +
+
+
+

More Info

+
+ You may be able to find more information here: + +
+
+ """; + } +} diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroCacheTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroBTreeCachedIndexTest.java similarity index 97% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroCacheTest.java rename to marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroBTreeCachedIndexTest.java index ff05f73b..0daeecbe 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroCacheTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroBTreeCachedIndexTest.java @@ -5,7 +5,7 @@ import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; -class MicroCacheTest { +class MicroBTreeCachedIndexTest { MicroCache mc; @BeforeEach diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepIfTest.java similarity index 54% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepTest.java rename to marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepIfTest.java index b890ea34..3dc2d57c 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepIfTest.java @@ -1,13 +1,15 @@ package nu.marginalia.wmsa.edge.index.reader.query.types; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf; import org.junit.jupiter.api.Test; import java.util.List; -class QueryFilterStepTest { - QueryFilterStep even = new QueryFilterStepFromPredicate(l -> (l%2) == 0); - 
QueryFilterStep divisibleByThree = new QueryFilterStepFromPredicate(l -> (l%3) == 0); - QueryFilterStep either = QueryFilterStep.anyOf(List.of(even, divisibleByThree)); +class QueryFilterStepIfTest { + QueryFilterStepIf even = new QueryFilterStepFromPredicate(l -> (l%2) == 0); + QueryFilterStepIf divisibleByThree = new QueryFilterStepFromPredicate(l -> (l%3) == 0); + QueryFilterStepIf either = QueryFilterStepIf.anyOf(List.of(even, divisibleByThree)); @Test public void test() { long[] values = new long[100]; From 971089bad3d8e8fa76622f1aeea4898b6b82c519 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sun, 11 Sep 2022 11:58:39 +0200 Subject: [PATCH 18/19] Cleaning up. --- .../wmsa/edge/search/EdgeSearchOperator.java | 219 +++--------------- .../edge/search/command/CommandEvaluator.java | 2 +- .../command/commands/ConvertCommand.java | 10 +- .../command/commands/SearchCommand.java | 14 +- ...earchCommand.java => SiteListCommand.java} | 19 +- .../edge/search/model/DomainInformation.java | 4 +- .../wmsa/edge/search/query/QueryParser.java | 4 - .../svc/EdgeSearchDomainSearchService.java | 64 +++++ .../svc/EdgeSearchQueryIndexService.java | 130 +++++++++++ .../EdgeSearchUnitConversionService.java} | 8 +- .../svc/EdgeSearchWikiArticlesService.java | 41 ++++ .../wmsa/edge/assistant/AssistantTest.java | 6 +- 12 files changed, 297 insertions(+), 224 deletions(-) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/{SiteSearchCommand.java => SiteListCommand.java} (80%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchDomainSearchService.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/{UnitConversion.java => svc/EdgeSearchUnitConversionService.java} (91%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchWikiArticlesService.java 
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java index b1f86b73..484f518f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java @@ -8,24 +8,18 @@ import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.assistant.client.AssistantClient; import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeIdList; -import nu.marginalia.wmsa.edge.model.id.EdgeIdSet; -import nu.marginalia.wmsa.edge.model.search.*; -import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; +import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; import nu.marginalia.wmsa.edge.search.model.BrowseResult; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults; import nu.marginalia.wmsa.edge.search.query.QueryFactory; import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery; import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; -import nu.marginalia.wmsa.edge.search.results.SearchResultDecorator; -import nu.marginalia.wmsa.edge.search.results.UrlDeduplicator; -import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient; +import nu.marginalia.wmsa.edge.search.svc.EdgeSearchDomainSearchService; +import nu.marginalia.wmsa.edge.search.svc.EdgeSearchQueryIndexService; +import nu.marginalia.wmsa.edge.search.svc.EdgeSearchUnitConversionService; +import nu.marginalia.wmsa.edge.search.svc.EdgeSearchWikiArticlesService; import 
org.apache.logging.log4j.util.Strings; -import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,7 +27,6 @@ import javax.annotation.Nullable; import java.util.*; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; -import java.util.regex.Pattern; import java.util.stream.Collectors; @Singleton @@ -41,32 +34,33 @@ public class EdgeSearchOperator { private static final Logger logger = LoggerFactory.getLogger(EdgeSearchOperator.class); private final AssistantClient assistantClient; - private final EncyclopediaClient encyclopediaClient; private final EdgeDataStoreDao edgeDataStoreDao; - private final EdgeIndexClient indexClient; private final QueryFactory queryFactory; - private final SearchResultDecorator resultDecorator; - private final Comparator resultListComparator; + + private final EdgeSearchQueryIndexService searchQueryService; + private final EdgeSearchDomainSearchService domainSearchService; + private final EdgeSearchWikiArticlesService wikiArticlesService; + private final EdgeSearchUnitConversionService edgeSearchUnitConversionService; + @Inject public EdgeSearchOperator(AssistantClient assistantClient, - EncyclopediaClient encyclopediaClient, EdgeDataStoreDao edgeDataStoreDao, - EdgeIndexClient indexClient, QueryFactory queryFactory, - SearchResultDecorator resultDecorator - ) { + + EdgeSearchQueryIndexService searchQueryService, + EdgeSearchDomainSearchService domainSearchService, + EdgeSearchWikiArticlesService wikiArticlesService, + EdgeSearchUnitConversionService edgeSearchUnitConversionService) { this.assistantClient = assistantClient; - this.encyclopediaClient = encyclopediaClient; this.edgeDataStoreDao = edgeDataStoreDao; - this.indexClient = indexClient; this.queryFactory = queryFactory; - this.resultDecorator = resultDecorator; - Comparator c = Comparator.comparing(ud -> Math.round(10*(ud.getTermScore() - ud.rankingIdAdjustment()))); - resultListComparator = 
c.thenComparing(EdgeUrlDetails::getRanking) - .thenComparing(EdgeUrlDetails::getId); + this.searchQueryService = searchQueryService; + this.domainSearchService = domainSearchService; + this.wikiArticlesService = wikiArticlesService; + this.edgeSearchUnitConversionService = edgeSearchUnitConversionService; } public List doApiSearch(Context ctx, @@ -77,22 +71,22 @@ public class EdgeSearchOperator { logger.info("Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ',')); - return performQuery(ctx, processedQuery); + return searchQueryService.performQuery(ctx, processedQuery); } - public DecoratedSearchResults doSearch(Context ctx, EdgeUserSearchParameters params, @Nullable Future eval) { - - Observable definitions = getWikiArticle(ctx, params.humanQuery()); + public DecoratedSearchResults doSearch(Context ctx, EdgeUserSearchParameters params) { + Future definitions = wikiArticlesService.getWikiArticle(ctx, params.humanQuery()); + Future eval = edgeSearchUnitConversionService.tryEval(ctx, params.humanQuery()); EdgeSearchQuery processedQuery = queryFactory.createQuery(params); logger.info("Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ',')); - List queryResults = performQuery(ctx, processedQuery); + List queryResults = searchQueryService.performQuery(ctx, processedQuery); + List domainResults = domainSearchService.getDomainResults(ctx, processedQuery.specs); - String evalResult = getEvalResult(eval); - List domainResults = getDomainResults(ctx, processedQuery.specs); - WikiArticles wikiArticles = definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst(); + String evalResult = getFutureOrDefault(eval, ""); + WikiArticles wikiArticles = getFutureOrDefault(definitions, new WikiArticles()); return DecoratedSearchResults.builder() .params(params) @@ -106,49 +100,16 @@ public class EdgeSearchOperator { .build(); } - private List getDomainResults(Context ctx, EdgeSearchSpecification specs) { - - List keywords = 
specs.subqueries.stream() - .filter(sq -> sq.searchTermsExclude.isEmpty() && sq.searchTermsInclude.size() == 1) - .map(sq -> sq.searchTermsInclude.get(0)) - .distinct() - .toList(); - - if (keywords.isEmpty()) - return Collections.emptyList(); - - List requests = new ArrayList<>(keywords.size() * specs.buckets.size()); - - for (var keyword : keywords) { - for (var bucket : specs.buckets) { - requests.add(new EdgeDomainSearchSpecification(bucket, IndexBlock.Link, keyword, - 1_000_000, 3, 25)); - } - } - - EdgeIdSet dedup = new EdgeIdSet<>(); - EdgeIdList values = new EdgeIdList<>(); - - for (var result : indexClient.queryDomains(ctx, requests)) { - for (int id : result.getResults().values()) { - if (dedup.add(id)) - values.add(id); - } - } - - return edgeDataStoreDao.getBrowseResultFromUrlIds(values); - } - - private String getEvalResult(@Nullable Future eval) { - if (eval == null || eval.isCancelled()) { - return ""; + private T getFutureOrDefault(@Nullable Future fut, T defaultValue) { + if (fut == null || fut.isCancelled()) { + return defaultValue; } try { - return eval.get(50, TimeUnit.MILLISECONDS); + return fut.get(50, TimeUnit.MILLISECONDS); } catch (Exception ex) { logger.warn("Error fetching eval result", ex); - return ""; + return defaultValue; } } @@ -165,50 +126,6 @@ public class EdgeSearchOperator { return domainId; } - public List performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... 
termsInclude) { - List sqs = new ArrayList<>(); - - sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block)); - - EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, 100, limitPerDomain, limitTotal, "", false); - - return performQuery(ctx, new EdgeSearchQuery(specs)); - } - - private List performQuery(Context ctx, EdgeSearchQuery processedQuery) { - - final List results = indexClient.query(ctx, processedQuery.specs); - - final List resultList = new ArrayList<>(results.size()); - - for (var details : resultDecorator.getAllUrlDetails(results)) { - if (details.getUrlQuality() <= -100) { - continue; - } - - details = details.withUrlQualityAdjustment( - adjustScoreBasedOnQuery(details, processedQuery.specs)); - - resultList.add(details); - } - - resultList.sort(resultListComparator); - - UrlDeduplicator deduplicator = new UrlDeduplicator(processedQuery.specs.limitByDomain); - List retList = new ArrayList<>(processedQuery.specs.limitTotal); - - for (var item : resultList) { - if (retList.size() >= processedQuery.specs.limitTotal) - break; - - if (!deduplicator.shouldRemove(item)) { - retList.add(item); - } - } - - return retList; - } - private List getProblems(Context ctx, String evalResult, List queryResults, EdgeSearchQuery processedQuery) { final List problems = new ArrayList<>(processedQuery.problems); boolean siteSearch = processedQuery.domain != null; @@ -233,76 +150,6 @@ public class EdgeSearchOperator { } - private final Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}"); - - private EdgePageScoreAdjustment adjustScoreBasedOnQuery(EdgeUrlDetails p, EdgeSearchSpecification specs) { - String titleLC = p.title == null ? "" : p.title.toLowerCase(); - String descLC = p.description == null ? "" : p.description.toLowerCase(); - String urlLC = p.url == null ? "" : p.url.path.toLowerCase(); - String domainLC = p.url == null ? 
"" : p.url.domain.toString().toLowerCase(); - - String[] searchTermsLC = specs.subqueries.get(0).searchTermsInclude.stream() - .map(String::toLowerCase) - .flatMap(s -> Arrays.stream(s.split("_"))) - .toArray(String[]::new); - int termCount = searchTermsLC.length; - - double titleHitsAdj = 0.; - final String[] titleParts = titleSplitPattern.split(titleLC); - for (String titlePart : titleParts) { - double hits = 0; - for (String term : searchTermsLC) { - if (titlePart.contains(term)) { - hits += term.length(); - } - } - titleHitsAdj += hits / Math.max(1, titlePart.length()); - } - - double titleFullHit = 0.; - if (termCount > 1 && titleLC.contains(specs.humanQuery.replaceAll("\"", "").toLowerCase())) { - titleFullHit = termCount; - } - long descHits = Arrays.stream(searchTermsLC).filter(descLC::contains).count(); - long urlHits = Arrays.stream(searchTermsLC).filter(urlLC::contains).count(); - long domainHits = Arrays.stream(searchTermsLC).filter(domainLC::contains).count(); - - double descHitsAdj = 0.; - for (String word : descLC.split("[^\\w]+")) { - descHitsAdj += Arrays.stream(searchTermsLC) - .filter(term -> term.length() > word.length()) - .filter(term -> term.contains(word)) - .mapToDouble(term -> word.length() / (double) term.length()) - .sum(); - } - - return EdgePageScoreAdjustment.builder() - .descAdj(Math.min(termCount, descHits) / (10. * termCount)) - .descHitsAdj(descHitsAdj / 10.) - .domainAdj(2 * Math.min(termCount, domainHits) / (double) termCount) - .urlAdj(Math.min(termCount, urlHits) / (10. 
* termCount)) - .titleAdj(5 * titleHitsAdj / (Math.max(1, titleParts.length) * Math.log(titleLC.length() + 2))) - .titleFullHit(titleFullHit) - .build(); - } - - @NotNull - private Observable getWikiArticle(Context ctx, String humanQuery) { - - if (!encyclopediaClient.isAlive()) { - return Observable.just(new WikiArticles()); - } - - return encyclopediaClient - .encyclopediaLookup(ctx, - humanQuery.replaceAll("\\s+", "_") - .replaceAll("\"", "") - ) - .subscribeOn(Schedulers.io()) - .onErrorReturn(e -> new WikiArticles()) - ; - } - private Iterable spellCheckTerms(Context ctx, EdgeSearchQuery disjointedQuery) { return Observable.fromIterable(disjointedQuery.searchTermsHuman) .subscribeOn(Schedulers.io()) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/CommandEvaluator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/CommandEvaluator.java index f9ae421a..6d5aa46c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/CommandEvaluator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/CommandEvaluator.java @@ -17,7 +17,7 @@ public class CommandEvaluator { BrowseCommand browse, ConvertCommand convert, DefinitionCommand define, - SiteSearchCommand site, + SiteListCommand site, BangCommand bang, SearchCommand search ) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/ConvertCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/ConvertCommand.java index 7f03c0fc..933d93e1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/ConvertCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/ConvertCommand.java @@ -2,9 +2,9 @@ package nu.marginalia.wmsa.edge.search.command.commands; import com.google.inject.Inject; import nu.marginalia.wmsa.configuration.server.Context; -import 
nu.marginalia.wmsa.edge.search.UnitConversion; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; +import nu.marginalia.wmsa.edge.search.svc.EdgeSearchUnitConversionService; import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; import nu.marginalia.wmsa.renderer.mustache.RendererFactory; @@ -13,19 +13,19 @@ import java.util.Map; import java.util.Optional; public class ConvertCommand implements SearchCommandInterface { - private final UnitConversion unitConversion; + private final EdgeSearchUnitConversionService edgeSearchUnitConversionService; private final MustacheRenderer> conversionRenderer; @Inject - public ConvertCommand(UnitConversion unitConversion, RendererFactory rendererFactory) throws IOException { - this.unitConversion = unitConversion; + public ConvertCommand(EdgeSearchUnitConversionService edgeSearchUnitConversionService, RendererFactory rendererFactory) throws IOException { + this.edgeSearchUnitConversionService = edgeSearchUnitConversionService; conversionRenderer = rendererFactory.renderer("edge/conversion-results"); } @Override public Optional process(Context ctx, SearchParameters parameters, String query) { - var conversion = unitConversion.tryConversion(ctx, query); + var conversion = edgeSearchUnitConversionService.tryConversion(ctx, query); if (conversion.isEmpty()) { return Optional.empty(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java index fcd24fa0..c75c0699 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java @@ -5,25 +5,23 @@ import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; 
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.search.EdgeSearchOperator; -import nu.marginalia.wmsa.edge.search.UnitConversion; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults; import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner; +import nu.marginalia.wmsa.edge.search.svc.EdgeSearchUnitConversionService; import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; import nu.marginalia.wmsa.renderer.mustache.RendererFactory; -import javax.annotation.CheckForNull; import java.io.IOException; import java.util.Optional; -import java.util.concurrent.Future; public class SearchCommand implements SearchCommandInterface { private final EdgeDomainBlacklist blacklist; private final EdgeDataStoreDao dataStoreDao; private final EdgeSearchOperator searchOperator; - private final UnitConversion unitConversion; + private final EdgeSearchUnitConversionService edgeSearchUnitConversionService; private final MustacheRenderer searchResultsRenderer; private BrowseResultCleaner browseResultCleaner; @@ -33,14 +31,14 @@ public class SearchCommand implements SearchCommandInterface { public SearchCommand(EdgeDomainBlacklist blacklist, EdgeDataStoreDao dataStoreDao, EdgeSearchOperator searchOperator, - UnitConversion unitConversion, + EdgeSearchUnitConversionService edgeSearchUnitConversionService, RendererFactory rendererFactory, BrowseResultCleaner browseResultCleaner ) throws IOException { this.blacklist = blacklist; this.dataStoreDao = dataStoreDao; this.searchOperator = searchOperator; - this.unitConversion = unitConversion; + this.edgeSearchUnitConversionService = edgeSearchUnitConversionService; this.browseResultCleaner = browseResultCleaner; searchResultsRenderer = 
rendererFactory.renderer("edge/search-results"); @@ -48,11 +46,9 @@ public class SearchCommand implements SearchCommandInterface { @Override public Optional process(Context ctx, SearchParameters parameters, String query) { - @CheckForNull - Future eval = unitConversion.tryEval(ctx, query); EdgeUserSearchParameters params = new EdgeUserSearchParameters(query, parameters.profile(), parameters.js()); - DecoratedSearchResults results = searchOperator.doSearch(ctx, params, eval); + DecoratedSearchResults results = searchOperator.doSearch(ctx, params); results.results.removeIf(detail -> blacklist.isBlacklisted(dataStoreDao.getDomainId(detail.url.domain))); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java index 56ddcc7f..6772c4cb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java @@ -4,14 +4,13 @@ import com.google.inject.Inject; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; -import nu.marginalia.wmsa.edge.search.EdgeSearchOperator; import nu.marginalia.wmsa.edge.search.EdgeSearchProfile; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; import 
nu.marginalia.wmsa.edge.search.model.DomainInformation; import nu.marginalia.wmsa.edge.search.siteinfo.DomainInformationService; +import nu.marginalia.wmsa.edge.search.svc.EdgeSearchQueryIndexService; import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; import nu.marginalia.wmsa.renderer.mustache.RendererFactory; import org.slf4j.Logger; @@ -23,28 +22,29 @@ import java.util.*; import java.util.function.Predicate; import java.util.regex.Pattern; -public class SiteSearchCommand implements SearchCommandInterface { +public class SiteListCommand implements SearchCommandInterface { private final EdgeDataStoreDao dataStoreDao; - private final EdgeSearchOperator searchOperator; private final DomainInformationService domainInformationService; + private final EdgeSearchQueryIndexService searchQueryIndexService; private final Logger logger = LoggerFactory.getLogger(getClass()); private final MustacheRenderer siteInfoRenderer; private final Predicate queryPatternPredicate = Pattern.compile("^site:[.A-Za-z\\-0-9]+$").asPredicate(); + @Inject - public SiteSearchCommand( + public SiteListCommand( DomainInformationService domainInformationService, EdgeDataStoreDao dataStoreDao, RendererFactory rendererFactory, - EdgeSearchOperator searchOperator) + EdgeSearchQueryIndexService searchQueryIndexService) throws IOException { this.dataStoreDao = dataStoreDao; - this.searchOperator = searchOperator; this.domainInformationService = domainInformationService; siteInfoRenderer = rendererFactory.renderer("edge/site-info"); + this.searchQueryIndexService = searchQueryIndexService; } @Override @@ -59,7 +59,7 @@ public class SiteSearchCommand implements SearchCommandInterface { List resultSet; Path screenshotPath = null; if (null != domain) { - resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, "site:"+domain); + resultSet = searchQueryIndexService.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, 
"site:"+domain); screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id()); } @@ -76,8 +76,7 @@ public class SiteSearchCommand implements SearchCommandInterface { String word = humanQuery.substring(definePrefix.length()).toLowerCase(); logger.info("Fetching Site Info: {}", word); - var results = domainInformationService.domainInfo(word) - .orElseGet(() -> new DomainInformation(null, false, 0, 0, 0, 0, 0, 0, EdgeDomainIndexingState.UNKNOWN, Collections.emptyList())); + var results = domainInformationService.domainInfo(word).orElseGet(DomainInformation::new); logger.debug("Results = {}", results); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java index d94ae487..862df844 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java @@ -2,13 +2,15 @@ package nu.marginalia.wmsa.edge.search.model; import lombok.AllArgsConstructor; import lombok.Getter; +import lombok.NoArgsConstructor; import lombok.ToString; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import java.util.List; -@Getter @AllArgsConstructor @ToString +@Getter @AllArgsConstructor @NoArgsConstructor +@ToString public class DomainInformation { EdgeDomain domain; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java index 792ab765..98057fd7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java @@ -239,10 +239,6 @@ public class QueryParser { returnValue.add(r); } - for (var qv : returnValue) { - 
System.out.println(qv); - } - return returnValue; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchDomainSearchService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchDomainSearchService.java new file mode 100644 index 00000000..5da50911 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchDomainSearchService.java @@ -0,0 +1,64 @@ +package nu.marginalia.wmsa.edge.search.svc; + +import com.google.inject.Inject; +import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; +import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.id.EdgeIdList; +import nu.marginalia.wmsa.edge.model.id.EdgeIdSet; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; +import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; +import nu.marginalia.wmsa.edge.search.model.BrowseResult; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class EdgeSearchDomainSearchService { + + private final EdgeIndexClient indexClient; + private final EdgeDataStoreDao edgeDataStoreDao; + + @Inject + public EdgeSearchDomainSearchService(EdgeIndexClient indexClient, EdgeDataStoreDao edgeDataStoreDao) { + this.indexClient = indexClient; + this.edgeDataStoreDao = edgeDataStoreDao; + } + + public List getDomainResults(Context ctx, EdgeSearchSpecification specs) { + + List keywords = specs.subqueries.stream() + .filter(sq -> sq.searchTermsExclude.isEmpty() && sq.searchTermsInclude.size() == 1) + .map(sq -> sq.searchTermsInclude.get(0)) + .distinct() + .toList(); + + if (keywords.isEmpty()) + return Collections.emptyList(); + + List requests = new ArrayList<>(keywords.size() * specs.buckets.size()); + + for (var keyword : 
keywords) { + for (var bucket : specs.buckets) { + requests.add(new EdgeDomainSearchSpecification(bucket, IndexBlock.Link, keyword, + 1_000_000, 3, 25)); + } + } + + EdgeIdSet dedup = new EdgeIdSet<>(); + EdgeIdList values = new EdgeIdList<>(); + + for (var result : indexClient.queryDomains(ctx, requests)) { + for (int id : result.getResults().values()) { + if (dedup.add(id)) + values.add(id); + } + } + + return edgeDataStoreDao.getBrowseResultFromUrlIds(values); + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java new file mode 100644 index 00000000..1d3d5649 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java @@ -0,0 +1,130 @@ +package nu.marginalia.wmsa.edge.search.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.model.search.*; +import nu.marginalia.wmsa.edge.search.EdgeSearchProfile; +import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery; +import nu.marginalia.wmsa.edge.search.results.SearchResultDecorator; +import nu.marginalia.wmsa.edge.search.results.UrlDeduplicator; + +import java.util.*; +import java.util.regex.Pattern; + +@Singleton +public class EdgeSearchQueryIndexService { + + private final SearchResultDecorator resultDecorator; + private final Comparator resultListComparator; + private final EdgeIndexClient indexClient; + @Inject + public EdgeSearchQueryIndexService(SearchResultDecorator resultDecorator, EdgeIndexClient indexClient) { + this.resultDecorator = resultDecorator; + this.indexClient = indexClient; + + Comparator c = Comparator.comparing(ud -> 
Math.round(10*(ud.getTermScore() - ud.rankingIdAdjustment()))); + resultListComparator = c.thenComparing(EdgeUrlDetails::getRanking) + .thenComparing(EdgeUrlDetails::getId); + } + + public List performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) { + List sqs = new ArrayList<>(); + + sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block)); + + EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, 100, limitPerDomain, limitTotal, "", false); + + return performQuery(ctx, new EdgeSearchQuery(specs)); + } + + public List performQuery(Context ctx, EdgeSearchQuery processedQuery) { + + final List results = indexClient.query(ctx, processedQuery.specs); + + final List resultList = new ArrayList<>(results.size()); + + for (var details : resultDecorator.getAllUrlDetails(results)) { + if (details.getUrlQuality() <= -100) { + continue; + } + + details = details.withUrlQualityAdjustment( + adjustScoreBasedOnQuery(details, processedQuery.specs)); + + resultList.add(details); + } + + resultList.sort(resultListComparator); + + UrlDeduplicator deduplicator = new UrlDeduplicator(processedQuery.specs.limitByDomain); + List retList = new ArrayList<>(processedQuery.specs.limitTotal); + + for (var item : resultList) { + if (retList.size() >= processedQuery.specs.limitTotal) + break; + + if (!deduplicator.shouldRemove(item)) { + retList.add(item); + } + } + + return retList; + } + + private final Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}"); + + private EdgePageScoreAdjustment adjustScoreBasedOnQuery(EdgeUrlDetails p, EdgeSearchSpecification specs) { + String titleLC = p.title == null ? "" : p.title.toLowerCase(); + String descLC = p.description == null ? "" : p.description.toLowerCase(); + String urlLC = p.url == null ? "" : p.url.path.toLowerCase(); + String domainLC = p.url == null ? 
"" : p.url.domain.toString().toLowerCase(); + + String[] searchTermsLC = specs.subqueries.get(0).searchTermsInclude.stream() + .map(String::toLowerCase) + .flatMap(s -> Arrays.stream(s.split("_"))) + .toArray(String[]::new); + int termCount = searchTermsLC.length; + + double titleHitsAdj = 0.; + final String[] titleParts = titleSplitPattern.split(titleLC); + for (String titlePart : titleParts) { + double hits = 0; + for (String term : searchTermsLC) { + if (titlePart.contains(term)) { + hits += term.length(); + } + } + titleHitsAdj += hits / Math.max(1, titlePart.length()); + } + + double titleFullHit = 0.; + if (termCount > 1 && titleLC.contains(specs.humanQuery.replaceAll("\"", "").toLowerCase())) { + titleFullHit = termCount; + } + long descHits = Arrays.stream(searchTermsLC).filter(descLC::contains).count(); + long urlHits = Arrays.stream(searchTermsLC).filter(urlLC::contains).count(); + long domainHits = Arrays.stream(searchTermsLC).filter(domainLC::contains).count(); + + double descHitsAdj = 0.; + for (String word : descLC.split("[^\\w]+")) { + descHitsAdj += Arrays.stream(searchTermsLC) + .filter(term -> term.length() > word.length()) + .filter(term -> term.contains(word)) + .mapToDouble(term -> word.length() / (double) term.length()) + .sum(); + } + + return EdgePageScoreAdjustment.builder() + .descAdj(Math.min(termCount, descHits) / (10. * termCount)) + .descHitsAdj(descHitsAdj / 10.) + .domainAdj(2 * Math.min(termCount, domainHits) / (double) termCount) + .urlAdj(Math.min(termCount, urlHits) / (10. 
* termCount)) + .titleAdj(5 * titleHitsAdj / (Math.max(1, titleParts.length) * Math.log(titleLC.length() + 2))) + .titleFullHit(titleFullHit) + .build(); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/UnitConversion.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchUnitConversionService.java similarity index 91% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/UnitConversion.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchUnitConversionService.java index 6738554b..491a1361 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/UnitConversion.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchUnitConversionService.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.search; +package nu.marginalia.wmsa.edge.search.svc; import nu.marginalia.wmsa.client.exception.RemoteException; import nu.marginalia.wmsa.configuration.server.Context; @@ -9,15 +9,13 @@ import org.slf4j.LoggerFactory; import javax.annotation.CheckForNull; import javax.inject.Inject; import javax.inject.Singleton; -import java.util.NoSuchElementException; import java.util.Optional; -import java.util.concurrent.CompletableFuture; import java.util.concurrent.Future; import java.util.function.Predicate; import java.util.regex.Pattern; @Singleton -public class UnitConversion { +public class EdgeSearchUnitConversionService { private final Logger logger = LoggerFactory.getLogger(getClass()); private final Pattern conversionPattern = Pattern.compile("((\\d+|\\s+|[.()\\-^+%*/]|log[^a-z]|log2[^a-z]|sqrt[^a-z]|log10|cos[^a-z]|sin[^a-z]|tan[^a-z]|log2|pi[^a-z]|e[^a-z]|2pi[^a-z])+)\\s*([a-zA-Z][a-zA-Z^.0-9]*\\s?[a-zA-Z^.0-9]*)\\s+in\\s+([a-zA-Z^.0-9]+\\s?[a-zA-Z^.0-9]*)"); private final Predicate evalPredicate = Pattern.compile("(\\d+|\\s+|[.()\\-^+%*/]|log|log2|sqrt|log10|cos|sin|tan|pi|e|2pi)+").asMatchPredicate(); @@ -25,7 +23,7 @@ public 
class UnitConversion { private final AssistantClient assistantClient; @Inject - public UnitConversion(AssistantClient assistantClient) { + public EdgeSearchUnitConversionService(AssistantClient assistantClient) { this.assistantClient = assistantClient; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchWikiArticlesService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchWikiArticlesService.java new file mode 100644 index 00000000..7a197763 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchWikiArticlesService.java @@ -0,0 +1,41 @@ +package nu.marginalia.wmsa.edge.search.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import io.reactivex.rxjava3.core.Observable; +import io.reactivex.rxjava3.schedulers.Schedulers; +import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; +import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient; +import org.jetbrains.annotations.NotNull; + +import java.util.concurrent.Future; + +@Singleton +public class EdgeSearchWikiArticlesService { + private final EncyclopediaClient encyclopediaClient; + + @Inject + public EdgeSearchWikiArticlesService(EncyclopediaClient encyclopediaClient) { + this.encyclopediaClient = encyclopediaClient; + } + + @NotNull + public Future getWikiArticle(Context ctx, String humanQuery) { + + if (!encyclopediaClient.isAlive()) { + return Observable.just(new WikiArticles()).toFuture(); + } + + return encyclopediaClient + .encyclopediaLookup(ctx, + humanQuery.replaceAll("\\s+", "_") + .replaceAll("\"", "") + ) + .subscribeOn(Schedulers.io()) + .onErrorReturn(e -> new WikiArticles()) + .toFuture() + ; + } + +} diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java index 7e1571f7..88d1232a 100644 --- 
a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java @@ -12,7 +12,7 @@ import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker; import nu.marginalia.wmsa.edge.assistant.eval.MathParser; import nu.marginalia.wmsa.edge.assistant.eval.Units; import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService; -import nu.marginalia.wmsa.edge.search.UnitConversion; +import nu.marginalia.wmsa.edge.search.svc.EdgeSearchUnitConversionService; import org.junit.jupiter.api.*; import org.junit.jupiter.api.parallel.Execution; import org.junit.jupiter.api.parallel.ExecutionMode; @@ -105,7 +105,7 @@ class AssistantTest { @Test public void testConvertUnitsWithParser() { - var conversion = new UnitConversion(client); + var conversion = new EdgeSearchUnitConversionService(client); assertEquals("0.3 m", conversion.tryConversion(Context.internal(), "30 cm in m").get()); assertEquals("500 m", conversion.tryConversion(Context.internal(), "0.5 km in m").get()); assertEquals("500 m", conversion.tryConversion(Context.internal(), "0.1+0.4 km in m").get()); @@ -125,7 +125,7 @@ class AssistantTest { @Test public void testEvalWithParser() throws ExecutionException, InterruptedException { - var conversion = new UnitConversion(client); + var conversion = new EdgeSearchUnitConversionService(client); assertEquals("305", conversion.tryEval(Context.internal(), "300+5").get()); assertEquals("1.772", conversion.tryEval(Context.internal(), "sqrt(pi)").get()); From 7749ce645ae9afa597c0e35493cbb3e156a5ced0 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 12 Sep 2022 10:39:02 +0200 Subject: [PATCH 19/19] Further more cleaning --- .../wmsa/edge/index/model/IndexBlock.java | 2 +- .../wmsa/edge/index/reader/SearchIndexReader.java | 5 +---- .../edge/index/svc/EdgeIndexLexiconService.java | 2 +- .../wmsa/edge/index/svc/EdgeIndexQueryService.java | 7 ++++++- 
.../edge/index/svc/query/IndexQueryFactory.java | 5 +---- .../wmsa/edge/index/svc/query/IndexQueryIf.java | 14 -------------- .../edge/index/svc/query/IndexSearchBudget.java | 7 +------ .../index/svc/query/ResultDomainDeduplicator.java | 9 +++++++-- .../wmsa/edge/search/EdgeSearchProfile.java | 4 ++-- 9 files changed, 20 insertions(+), 35 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java index 1392e05e..67b5df80 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java @@ -8,8 +8,8 @@ public enum IndexBlock { Subjects(IndexBlockType.QUALITY_SIGNAL, 3, 1.0), NamesWords(IndexBlockType.QUALITY_SIGNAL, 4, 3.0), - Artifacts(IndexBlockType.QUALITY_SIGNAL, 5, 10), + Artifacts(IndexBlockType.PAGE_DATA, 5, 10), Meta(IndexBlockType.PAGE_DATA, 6, 7), Tfidf_Top(IndexBlockType.TF_IDF, 7, 1.5), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java index 5a796cce..bca63d6b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java @@ -18,7 +18,6 @@ import java.util.stream.Stream; public class SearchIndexReader implements AutoCloseable { private final EnumMap indices; - private final EnumMap queryBuilders; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -45,7 +44,6 @@ public class SearchIndexReader implements AutoCloseable { var topIndex = indices.get(IndexBlock.Tfidf_Top); var linkIndex = indices.get(IndexBlock.Link); var titleIndex = indices.get(IndexBlock.Title); - var namesIndex = indices.get(IndexBlock.NamesWords); var siteIndex = 
indices.get(IndexBlock.Site); var metaIndex = indices.get(IndexBlock.Meta); var topicIndex = indices.get(IndexBlock.Subjects); @@ -60,10 +58,9 @@ public class SearchIndexReader implements AutoCloseable { queryBuilders = new EnumMap<>(IndexBlock.class); List excludeIndices = listOfNonNulls(metaIndex, titleIndex, topIndex, midIndex, lowIndex, words1); - List priorityIndices = listOfNonNulls(titleIndex, linkIndex, siteIndex, topIndex); + List priorityIndices = listOfNonNulls(titleIndex, linkIndex, siteIndex, topIndex, topicIndex); queryBuilders.put(IndexBlock.Title, new IndexQueryFactory(listOfNonNulls(metaIndex, titleIndex, linkIndex), excludeIndices, priorityIndices)); - queryBuilders.put(IndexBlock.Tfidf_Lower, new IndexQueryFactory(listOfNonNulls(metaIndex, titleIndex, linkIndex, namesIndex, topIndex, siteIndex, midIndex, lowIndex, topicIndex, artifacts), excludeIndices, priorityIndices)); queryBuilders.put(IndexBlock.Words_1, new IndexQueryFactory(listOfNonNulls(metaIndex, words1), excludeIndices, priorityIndices)); queryBuilders.put(IndexBlock.Words_2, new IndexQueryFactory(listOfNonNulls(metaIndex, words2), excludeIndices, priorityIndices)); queryBuilders.put(IndexBlock.Words_4, new IndexQueryFactory(listOfNonNulls(metaIndex, words4), excludeIndices, priorityIndices)); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java index 6cabbd9d..a942d892 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java @@ -83,7 +83,7 @@ public class EdgeIndexLexiconService { var header = new SearchIndexJournalEntryHeader(domainId, urlId, block); indexWriter.put(header, entry); - }; + } } private long[] getOrInsertWordIds(List words) { diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java index a410159f..64e22c26 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java @@ -33,6 +33,7 @@ import spark.Spark; import java.util.*; import java.util.function.LongPredicate; +import static java.util.Comparator.comparing; import static spark.Spark.halt; @Singleton @@ -163,7 +164,11 @@ public class EdgeIndexQueryService { cachePool.clear(); return results.stream() - .sorted(Comparator.comparing(EdgeSearchResultItem::getScore)) + .sorted( + comparing(EdgeSearchResultItem::getScore) + .thenComparing(EdgeSearchResultItem::getRanking) + .thenComparing(EdgeSearchResultItem::getUrlIdInt) + ) .filter(domainCountFilter::test) .limit(specsSet.getLimitTotal()).toList(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryFactory.java index 5bd31122..9de08040 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryFactory.java @@ -35,10 +35,7 @@ public class IndexQueryFactory { } } - - IndexQuery query = new IndexQuery(sources); - - return new IndexQueryBuilder(query, cachePool); + return new IndexQueryBuilder(new IndexQuery(sources), cachePool); } public class IndexQueryBuilder { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryIf.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryIf.java index 82e951c4..b07515ed 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryIf.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryIf.java @@ -3,20 +3,6 @@ package nu.marginalia.wmsa.edge.index.svc.query; import java.util.stream.LongStream; public interface IndexQueryIf { - IndexQueryIf EMPTY = new IndexQueryIf() { - @Override - public IndexQueryIf also(int wordId) { return this; } - - @Override - public IndexQueryIf alsoCached(int wordId) { return this; } - - @Override - public IndexQueryIf not(int wordId) { return this; } - - @Override - public LongStream stream() { return LongStream.empty(); } - }; - IndexQueryIf also(int wordId); IndexQueryIf alsoCached(int wordId); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexSearchBudget.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexSearchBudget.java index 24d28594..b6229bd3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexSearchBudget.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexSearchBudget.java @@ -2,16 +2,11 @@ package nu.marginalia.wmsa.edge.index.svc.query; public class IndexSearchBudget { - private long timeout; + private final long timeout; public IndexSearchBudget(long limitTime) { this.timeout = System.currentTimeMillis() + limitTime; } - // Used for short-circuiting Stream-objects using takeWhile, we don't care - public boolean take(long unused) { - return hasTimeLeft(); - } public boolean hasTimeLeft() { return System.currentTimeMillis() < timeout; } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java index 24922eb7..9b1ca5e1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java @@ -34,12 +34,17 @@ 
public class ResultDomainDeduplicator { return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain; } + public boolean test(EdgeSearchResultItem item) { - int ranking = item.getRanking(); + final int ranking = item.getRanking(); if (ranking == Integer.MAX_VALUE) { return true; } - return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain; + // For ResultItems, consider bucketId as well as different buckets may use different + // ranking algorithms + final long key = ranking*32L + item.bucketId; + + return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java index 65b7f999..99601965 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java @@ -62,6 +62,6 @@ public enum EdgeSearchProfile { } class SearchOrder { - static List DEFAULT_ORDER = List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, - IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus); + static List DEFAULT_ORDER + = List.of(IndexBlock.Title, IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus); } \ No newline at end of file