From 9ece07d559e29a9321194f74673df6823995f841 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 9 Mar 2023 17:52:35 +0100 Subject: [PATCH] Chasing a result ranking bug --- .../nu/marginalia/model/crawl/PubDate.java | 3 + .../nu/marginalia/model/idx/WordMetadata.java | 15 ++- .../keywords/DocumentKeywordExtractor.java | 23 +---- .../language-processing/build.gradle | 1 + .../language/model/KeywordMetadata.java | 84 ++++++++++++++--- .../valuation/SearchResultValuator.java | 20 +--- .../valuation/SearchResultValuatorTest.java | 91 +++++++++++++++++++ 7 files changed, 178 insertions(+), 59 deletions(-) create mode 100644 code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java diff --git a/code/common/model/src/main/java/nu/marginalia/model/crawl/PubDate.java b/code/common/model/src/main/java/nu/marginalia/model/crawl/PubDate.java index 67f67105..de621ce7 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/crawl/PubDate.java +++ b/code/common/model/src/main/java/nu/marginalia/model/crawl/PubDate.java @@ -57,5 +57,8 @@ public record PubDate(String dateIso8601, int year) { public static int fromYearByte(int yearByte) { return yearByte + ENCODING_OFFSET; } + public static int toYearByte(int year) { + return Math.max(0, year - ENCODING_OFFSET); + } } diff --git a/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java b/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java index 95c2c407..7aba4d43 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java @@ -15,8 +15,9 @@ public record WordMetadata(int tfIdf, byte flags) { public WordMetadata { if (WordMetadata.class.desiredAssertionStatus()) { - // invariant checks go here - assert(Integer.bitCount(positions) <= count); + if (Integer.bitCount(positions) > count) { + System.err.println(Integer.bitCount(positions) + ">" + count); + } } } @@ -27,6 +28,10 @@ public record WordMetadata(int tfIdf, public static final int TF_IDF_SHIFT = 16; public static final int POSITIONS_SHIFT = 32; + public static final long POSITIONS_MASK = 0xFFFF_FFFFL; + + public static final long FLAGS_MASK = 0xFF; + public WordMetadata() { this(emptyValue()); @@ -35,9 +40,9 @@ public record WordMetadata(int tfIdf, public WordMetadata(long value) { this( (int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK), - (int)(value >>> POSITIONS_SHIFT), - (int)((value >>> COUNT_SHIFT) & COUNT_MASK), - (byte) (value & 0xFF) + (int)((value >>> POSITIONS_SHIFT) & POSITIONS_MASK), + Math.max((int)((value >>> POSITIONS_SHIFT) & POSITIONS_MASK), (int)((value >>> COUNT_SHIFT) & COUNT_MASK)), + (byte) (value & FLAGS_MASK) ); } diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/DocumentKeywordExtractor.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/DocumentKeywordExtractor.java index 90084f49..cc0bcb78 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/DocumentKeywordExtractor.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/DocumentKeywordExtractor.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.keywords; +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import nu.marginalia.language.WordPatterns; import nu.marginalia.language.encoding.AsciiFlattener; import nu.marginalia.language.keywords.KeywordExtractor; @@ -33,26 +34,6 @@ public class DocumentKeywordExtractor { } - public DocumentKeywordsBuilder extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) { - - List titleWords = extractTitleWords(documentLanguageData); - List wordsNamesAll = nameCounter.count(documentLanguageData, 2); - List subjects = subjectCounter.count(keywordMetadata, documentLanguageData); - - for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed); - for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed); - for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed); - - List artifacts = getArtifacts(documentLanguageData); - - FilteringDocumentKeywordsBuilder wordsBuilder = new FilteringDocumentKeywordsBuilder(); - - createWords(wordsBuilder, keywordMetadata, titleWords, 0); - artifacts.forEach(wordsBuilder::addWithBlankMetadata); - - return wordsBuilder.build(); - } - public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) { List titleWords = extractTitleWords(documentLanguageData); @@ -86,7 +67,7 @@ public class DocumentKeywordExtractor { public void getWordPositions(KeywordMetadata keywordMetadata, DocumentLanguageData dld) { - Map ret = keywordMetadata.positionMask(); + Object2IntOpenHashMap ret = keywordMetadata.positionMask(); for (var sent : dld.titleSentences) { int posBit = 1; diff --git a/code/libraries/language-processing/build.gradle b/code/libraries/language-processing/build.gradle index 78b686a5..dfcec644 100644 --- a/code/libraries/language-processing/build.gradle +++ b/code/libraries/language-processing/build.gradle @@ -28,6 +28,7 @@ dependencies { implementation libs.guice implementation libs.jsoup implementation libs.trove + implementation libs.fastutil implementation libs.bundles.nlp implementation libs.commons.lang3 diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java index 914d3e6c..c18fb5da 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java @@ -1,33 +1,34 @@ package nu.marginalia.language.model; +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.model.crawl.EdgePageWordFlags; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; +import java.util.Objects; -public record KeywordMetadata(HashSet titleKeywords, - HashSet subjectKeywords, - HashSet namesKeywords, - HashMap wordsTfIdf, - HashMap positionMask, - EnumSet wordFlagsTemplate -) -{ +public final class KeywordMetadata { + + private static final WordFrequencyData empty = new WordFrequencyData(0, 0); + private final HashSet titleKeywords = new HashSet<>(50); + private final HashSet subjectKeywords = new HashSet<>(10); + private final HashSet namesKeywords = new HashSet<>(50); + private final HashMap wordsTfIdf; + private final Object2IntOpenHashMap positionMask; + private final EnumSet wordFlagsTemplate; public KeywordMetadata(EnumSet flags) { - this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50), - new HashMap<>(15_000), - new HashMap<>(10_000), - flags); + this.positionMask = new Object2IntOpenHashMap<>(10_000, 0.7f); + this.wordsTfIdf = new HashMap<>(10_000); + this.wordFlagsTemplate = flags; } public KeywordMetadata() { this(EnumSet.noneOf(EdgePageWordFlags.class)); } - private static final WordFrequencyData empty = new WordFrequencyData(0, 0); public long getMetadataForWord(EnumSet flagsTemplate, String stemmed) { WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty); @@ -43,8 +44,63 @@ public record KeywordMetadata(HashSet titleKeywords, flags.add(EdgePageWordFlags.Title); int positions = positionMask.getOrDefault(stemmed, 0); + int count = Math.max(Integer.bitCount(positions), tfidf.count()); - return new WordMetadata(tfidf.tfIdfNormalized(), positions, tfidf.count(), flags).encode(); + return new WordMetadata(tfidf.tfIdfNormalized(), positions, count, flags).encode(); } + public HashSet titleKeywords() { + return titleKeywords; + } + + public HashSet subjectKeywords() { + return subjectKeywords; + } + + public HashSet namesKeywords() { + return namesKeywords; + } + + public HashMap wordsTfIdf() { + return wordsTfIdf; + } + + public Object2IntOpenHashMap positionMask() { + return positionMask; + } + + public EnumSet wordFlagsTemplate() { + return wordFlagsTemplate; + } + + @Override + public boolean equals(Object obj) { + if (obj == this) return true; + if (obj == null || obj.getClass() != this.getClass()) return false; + var that = (KeywordMetadata) obj; + return Objects.equals(this.titleKeywords, that.titleKeywords) && + Objects.equals(this.subjectKeywords, that.subjectKeywords) && + Objects.equals(this.namesKeywords, that.namesKeywords) && + Objects.equals(this.wordsTfIdf, that.wordsTfIdf) && + Objects.equals(this.positionMask, that.positionMask) && + Objects.equals(this.wordFlagsTemplate, that.wordFlagsTemplate); + } + + @Override + public int hashCode() { + return Objects.hash(titleKeywords, subjectKeywords, namesKeywords, wordsTfIdf, positionMask, wordFlagsTemplate); + } + + @Override + public String toString() { + return "KeywordMetadata[" + + "titleKeywords=" + titleKeywords + ", " + + "subjectKeywords=" + subjectKeywords + ", " + + "namesKeywords=" + namesKeywords + ", " + + "wordsTfIdf=" + wordsTfIdf + ", " + + "positionMask=" + positionMask + ", " + + "wordFlagsTemplate=" + wordFlagsTemplate + ']'; + } + + } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java index 6e57f4da..3cf1bfbb 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java @@ -78,9 +78,8 @@ public class SearchResultValuator { continue; final double bm25Factor = getBM25(keywordSet, length); - final double minCountFactor = getMinCountFactor(keywordSet); - bestScore = min(bestScore, bm25Factor * minCountFactor); + bestScore = min(bestScore, bm25Factor); bestAllTermsFactor = min(bestAllTermsFactor, getAllTermsFactorForSet(keywordSet, titleLength)); @@ -96,23 +95,6 @@ public class SearchResultValuator { .orElse(false); } - private double getMinCountFactor(SearchResultsKeywordSet keywordSet) { - // Penalize results with few keyword hits - - int min = 32; - - for (var keyword : keywordSet) { - if (!keyword.wordMetadata.hasFlag(EdgePageWordFlags.Title) && keyword.score.isRegular()) { - min = min(min, keyword.count()); - } - } - - if (min <= 1) return 2; - if (min <= 2) return 1.5; - if (min <= 3) return 1.25; - return 1; - } - private double getBM25(SearchResultsKeywordSet keywordSet, int length) { final double scalingFactor = 750.; diff --git a/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java b/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java new file mode 100644 index 00000000..ec23af7f --- /dev/null +++ b/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java @@ -0,0 +1,91 @@ +package nu.marginalia.search.valuation; + +import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore; +import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.model.crawl.EdgePageDocumentFlags; +import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.model.idx.WordMetadata; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import java.util.EnumSet; +import java.util.List; +import java.util.Set; + +import static org.mockito.Mockito.when; + +class SearchResultValuatorTest { + + TermFrequencyDict dict; + SearchResultValuator valuator; + + @BeforeEach + public void setUp() { + + dict = Mockito.mock(TermFrequencyDict.class); + when(dict.docCount()).thenReturn(100_000); + + valuator = new SearchResultValuator(dict); + + } + List titleOnlyLowCountSet = List.of( + new EdgeSearchResultKeywordScore(0, "bob", + wordMetadata(32, Set.of(1), EnumSet.of(EdgePageWordFlags.Title)), + docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)), + false) + ); + List highCountNoTitleSet = List.of( + new EdgeSearchResultKeywordScore(0, "bob", + wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh)), + docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)), + false) + ); + + List highCountSubjectSet = List.of( + new EdgeSearchResultKeywordScore(0, "bob", + wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh, EdgePageWordFlags.Subjects)), + docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)), + false) + ); + + + List first = List.of( + new EdgeSearchResultKeywordScore(0, "bob", + wordMetadata(202, Set.of(1,3,4,6,7,9,10,11), EnumSet.of(EdgePageWordFlags.TfIdfHigh)), + docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)), + false) + ); + + @Test + void evaluateTerms() { + + when(dict.getTermFreq("bob")).thenReturn(10L); + + double titleOnlyLowCount = valuator.evaluateTerms(titleOnlyLowCountSet, 10_000, 32); + double titleLongOnlyLowCount = valuator.evaluateTerms(titleOnlyLowCountSet, 10_000, 72); + double highCountNoTitle = valuator.evaluateTerms(highCountNoTitleSet, 10_000, 32); + double highCountSubject = valuator.evaluateTerms(highCountSubjectSet, 10_000, 32); + + System.out.println(titleOnlyLowCount); + System.out.println(titleLongOnlyLowCount); + System.out.println(highCountNoTitle); + System.out.println(highCountSubject); + } + + private long docMetadata(int topology, int year, int sets, int quality, EnumSet flags) { + return new DocumentMetadata(topology, PubDate.toYearByte(year), sets, quality, flags).encode(); + } + + private long wordMetadata(int tfIdf, Set positions, Set wordFlags) { + int posBits = positions.stream() + .mapToInt(i -> (int)((1L << i) & 0xFFFF_FFFFL)) + .reduce((a,b) -> a|b) + .orElse(0); + + return new WordMetadata(tfIdf, posBits, positions.size(), wordFlags).encode(); + } + +} \ No newline at end of file