Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-24 05:18:58 +00:00)
Chasing a result ranking bug

commit 9ece07d559
parent 0ae4731cf1
PubDate.java:

@@ -57,5 +57,8 @@ public record PubDate(String dateIso8601, int year) {
     public static int fromYearByte(int yearByte) {
         return yearByte + ENCODING_OFFSET;
     }
+    public static int toYearByte(int year) {
+        return Math.max(0, year - ENCODING_OFFSET);
+    }
 
 }
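The new toYearByte is the inverse of fromYearByte, clamped at zero so years before the encoding offset cannot underflow the byte. A minimal round-trip sketch; the ENCODING_OFFSET value below is illustrative, the real constant lives in PubDate:

    class YearByteSketch {
        static final int ENCODING_OFFSET = 1900; // assumed value, for illustration only

        static int toYearByte(int year) { return Math.max(0, year - ENCODING_OFFSET); }
        static int fromYearByte(int yearByte) { return yearByte + ENCODING_OFFSET; }

        public static void main(String[] args) {
            // Years at or after the offset survive the round trip...
            System.out.println(fromYearByte(toYearByte(2010)) == 2010);            // true
            // ...while earlier years clamp to the offset itself.
            System.out.println(fromYearByte(toYearByte(1850)) == ENCODING_OFFSET); // true
        }
    }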
WordMetadata.java:

@@ -15,8 +15,9 @@ public record WordMetadata(int tfIdf,
                            byte flags) {
     public WordMetadata {
         if (WordMetadata.class.desiredAssertionStatus()) {
-            // invariant checks go here
-            assert(Integer.bitCount(positions) <= count);
+            if (Integer.bitCount(positions) > count) {
+                System.err.println(Integer.bitCount(positions) + ">" + count);
+            }
         }
     }
 
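The compact constructor used to throw via assert when a word had more distinct position bits than its reported occurrence count; while chasing the bug it now prints the offending pair to stderr instead, so bad records flow through and every violation stays visible. A runnable sketch of the guard mechanism (the class name is hypothetical):

    class AssertionGuardSketch {
        public static void main(String[] args) {
            int positions = 0b111; // three distinct position bits
            int count = 2;         // but only two reported occurrences
            // desiredAssertionStatus() is true only when the JVM runs with -ea,
            // so the diagnostic costs nothing in production.
            if (AssertionGuardSketch.class.desiredAssertionStatus()) {
                if (Integer.bitCount(positions) > count) {
                    System.err.println(Integer.bitCount(positions) + ">" + count);
                }
            }
        }
    }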
@@ -27,6 +28,10 @@ public record WordMetadata(int tfIdf,
     public static final int TF_IDF_SHIFT = 16;
 
     public static final int POSITIONS_SHIFT = 32;
+    public static final long POSITIONS_MASK = 0xFFFF_FFFFL;
+
+    public static final long FLAGS_MASK = 0xFF;
+
 
     public WordMetadata() {
         this(emptyValue());
@@ -35,9 +40,9 @@ public record WordMetadata(int tfIdf,
     public WordMetadata(long value) {
         this(
                 (int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK),
-                (int)(value >>> POSITIONS_SHIFT),
-                (int)((value >>> COUNT_SHIFT) & COUNT_MASK),
-                (byte) (value & 0xFF)
+                (int)((value >>> POSITIONS_SHIFT) & POSITIONS_MASK),
+                Math.max((int)((value >>> POSITIONS_SHIFT) & POSITIONS_MASK), (int)((value >>> COUNT_SHIFT) & COUNT_MASK)),
+                (byte) (value & FLAGS_MASK)
        );
    }
 
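Taken together, the constants imply a 64-bit layout: flags in the low byte, the occurrence count and tf-idf in the middle fields, and the 32-bit position mask in the high word. A sketch of that packing, assuming COUNT_SHIFT = 8 and mask widths for the fields the hunks do not show:

    class WordMetadataLayoutSketch {
        static final int  COUNT_SHIFT     = 8;            // assumed
        static final long COUNT_MASK      = 0xFFL;        // assumed
        static final int  TF_IDF_SHIFT    = 16;
        static final long TF_IDF_MASK     = 0xFFFFL;      // assumed
        static final int  POSITIONS_SHIFT = 32;
        static final long POSITIONS_MASK  = 0xFFFF_FFFFL;
        static final long FLAGS_MASK      = 0xFF;

        static long encode(int tfIdf, int positions, int count, byte flags) {
            return ((long) positions << POSITIONS_SHIFT)
                 | ((tfIdf & TF_IDF_MASK) << TF_IDF_SHIFT)
                 | ((count & COUNT_MASK) << COUNT_SHIFT)
                 | (flags & FLAGS_MASK);
        }

        public static void main(String[] args) {
            long v = encode(129, 0b1011, 3, (byte) 1);
            // After ">>> 32" the upper bits are already zero, so POSITIONS_MASK is
            // defensive here: it documents the field width and keeps the decode
            // correct even if the shift were ever smaller than 32.
            int positions = (int) ((v >>> POSITIONS_SHIFT) & POSITIONS_MASK);
            System.out.println(positions == 0b1011); // true
        }
    }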
DocumentKeywordExtractor.java:

@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.keywords;
 
+import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
 import nu.marginalia.language.WordPatterns;
 import nu.marginalia.language.encoding.AsciiFlattener;
 import nu.marginalia.language.keywords.KeywordExtractor;
@@ -33,26 +34,6 @@ public class DocumentKeywordExtractor {
     }
 
 
-    public DocumentKeywordsBuilder extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
-
-        List<WordRep> titleWords = extractTitleWords(documentLanguageData);
-        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
-        List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
-
-        for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
-        for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
-        for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
-
-        List<String> artifacts = getArtifacts(documentLanguageData);
-
-        FilteringDocumentKeywordsBuilder wordsBuilder = new FilteringDocumentKeywordsBuilder();
-
-        createWords(wordsBuilder, keywordMetadata, titleWords, 0);
-        artifacts.forEach(wordsBuilder::addWithBlankMetadata);
-
-        return wordsBuilder.build();
-    }
-
     public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
 
         List<WordRep> titleWords = extractTitleWords(documentLanguageData);
@@ -86,7 +67,7 @@ public class DocumentKeywordExtractor {
 
 
     public void getWordPositions(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
-        Map<String, Integer> ret = keywordMetadata.positionMask();
+        Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask();
 
         for (var sent : dld.titleSentences) {
             int posBit = 1;
build.gradle:

@@ -28,6 +28,7 @@ dependencies {
     implementation libs.guice
     implementation libs.jsoup
     implementation libs.trove
+    implementation libs.fastutil
 
     implementation libs.bundles.nlp
     implementation libs.commons.lang3
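The new fastutil dependency backs the switch from HashMap<String, Integer> to Object2IntOpenHashMap for the position masks: values are stored as primitive ints, so the hot getOrDefault path avoids Integer boxing. A small illustration of the primitive-friendly API (map contents are made up):

    import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;

    class PositionMaskSketch {
        public static void main(String[] args) {
            Object2IntOpenHashMap<String> positionMask = new Object2IntOpenHashMap<>(10_000, 0.7f);
            positionMask.put("bob", 0b101);                  // primitive int value, no boxing
            int mask = positionMask.getOrDefault("bob", 0);  // primitive overload
            positionMask.put("bob", mask | 0b010);           // OR in another position bit
            System.out.println(Integer.toBinaryString(positionMask.getInt("bob"))); // 111
        }
    }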
KeywordMetadata.java:

@@ -1,33 +1,34 @@
 package nu.marginalia.language.model;
 
+import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
 import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.model.crawl.EdgePageWordFlags;
 
 import java.util.EnumSet;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Objects;
 
-public record KeywordMetadata(HashSet<String> titleKeywords,
-                              HashSet<String> subjectKeywords,
-                              HashSet<String> namesKeywords,
-                              HashMap<String, WordFrequencyData> wordsTfIdf,
-                              HashMap<String, Integer> positionMask,
-                              EnumSet<EdgePageWordFlags> wordFlagsTemplate
-                              )
-{
+public final class KeywordMetadata {
+
+    private static final WordFrequencyData empty = new WordFrequencyData(0, 0);
+    private final HashSet<String> titleKeywords = new HashSet<>(50);
+    private final HashSet<String> subjectKeywords = new HashSet<>(10);
+    private final HashSet<String> namesKeywords = new HashSet<>(50);
+    private final HashMap<String, WordFrequencyData> wordsTfIdf;
+    private final Object2IntOpenHashMap<String> positionMask;
+    private final EnumSet<EdgePageWordFlags> wordFlagsTemplate;
 
     public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
-        this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
-                new HashMap<>(15_000),
-                new HashMap<>(10_000),
-                flags);
+        this.positionMask = new Object2IntOpenHashMap<>(10_000, 0.7f);
+        this.wordsTfIdf = new HashMap<>(10_000);
+        this.wordFlagsTemplate = flags;
     }
 
     public KeywordMetadata() {
         this(EnumSet.noneOf(EdgePageWordFlags.class));
     }
 
-    private static final WordFrequencyData empty = new WordFrequencyData(0, 0);
     public long getMetadataForWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
 
         WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
@@ -43,8 +44,63 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
             flags.add(EdgePageWordFlags.Title);
 
         int positions = positionMask.getOrDefault(stemmed, 0);
+        int count = Math.max(Integer.bitCount(positions), tfidf.count());
 
-        return new WordMetadata(tfidf.tfIdfNormalized(), positions, tfidf.count(), flags).encode();
+        return new WordMetadata(tfidf.tfIdfNormalized(), positions, count, flags).encode();
     }
 
+    public HashSet<String> titleKeywords() {
+        return titleKeywords;
+    }
+
+    public HashSet<String> subjectKeywords() {
+        return subjectKeywords;
+    }
+
+    public HashSet<String> namesKeywords() {
+        return namesKeywords;
+    }
+
+    public HashMap<String, WordFrequencyData> wordsTfIdf() {
+        return wordsTfIdf;
+    }
+
+    public Object2IntOpenHashMap<String> positionMask() {
+        return positionMask;
+    }
+
+    public EnumSet<EdgePageWordFlags> wordFlagsTemplate() {
+        return wordFlagsTemplate;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (obj == this) return true;
+        if (obj == null || obj.getClass() != this.getClass()) return false;
+        var that = (KeywordMetadata) obj;
+        return Objects.equals(this.titleKeywords, that.titleKeywords) &&
+                Objects.equals(this.subjectKeywords, that.subjectKeywords) &&
+                Objects.equals(this.namesKeywords, that.namesKeywords) &&
+                Objects.equals(this.wordsTfIdf, that.wordsTfIdf) &&
+                Objects.equals(this.positionMask, that.positionMask) &&
+                Objects.equals(this.wordFlagsTemplate, that.wordFlagsTemplate);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(titleKeywords, subjectKeywords, namesKeywords, wordsTfIdf, positionMask, wordFlagsTemplate);
+    }
+
+    @Override
+    public String toString() {
+        return "KeywordMetadata[" +
+                "titleKeywords=" + titleKeywords + ", " +
+                "subjectKeywords=" + subjectKeywords + ", " +
+                "namesKeywords=" + namesKeywords + ", " +
+                "wordsTfIdf=" + wordsTfIdf + ", " +
+                "positionMask=" + positionMask + ", " +
+                "wordFlagsTemplate=" + wordFlagsTemplate + ']';
+    }
+
+
 }
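The count repair here mirrors the invariant checked in WordMetadata's constructor: a word cannot occur fewer times than it has distinct position bits, so the count passed to the encoder is lifted to at least Integer.bitCount(positions). A tiny demonstration (the stale count value is made up):

    class CountInvariantSketch {
        public static void main(String[] args) {
            int positions = 0b10110; // hits in three distinct position buckets
            int tfidfCount = 2;      // underreported count from the tf-idf side
            int count = Math.max(Integer.bitCount(positions), tfidfCount);
            System.out.println(count); // 3 -- bitCount(positions) wins
            // This is exactly the condition the WordMetadata constructor checks:
            System.out.println(Integer.bitCount(positions) <= count); // true
        }
    }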
SearchResultValuator.java:

@@ -78,9 +78,8 @@ public class SearchResultValuator {
                 continue;
 
             final double bm25Factor = getBM25(keywordSet, length);
-            final double minCountFactor = getMinCountFactor(keywordSet);
 
-            bestScore = min(bestScore, bm25Factor * minCountFactor);
+            bestScore = min(bestScore, bm25Factor);
 
             bestAllTermsFactor = min(bestAllTermsFactor, getAllTermsFactorForSet(keywordSet, titleLength));
 
@@ -96,23 +95,6 @@ public class SearchResultValuator {
                 .orElse(false);
     }
 
-    private double getMinCountFactor(SearchResultsKeywordSet keywordSet) {
-        // Penalize results with few keyword hits
-
-        int min = 32;
-
-        for (var keyword : keywordSet) {
-            if (!keyword.wordMetadata.hasFlag(EdgePageWordFlags.Title) && keyword.score.isRegular()) {
-                min = min(min, keyword.count());
-            }
-        }
-
-        if (min <= 1) return 2;
-        if (min <= 2) return 1.5;
-        if (min <= 3) return 1.25;
-        return 1;
-    }
-
     private double getBM25(SearchResultsKeywordSet keywordSet, int length) {
         final double scalingFactor = 750.;
 
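With the min-count penalty gone, the per-set score reduces to the BM25 factor alone. For orientation, the textbook BM25 term weight looks roughly like the sketch below; the valuator's getBM25 necessarily differs in detail (its 750. scaling factor, keyword sets and length inputs are its own), so treat this as background, not the project's formula:

    class Bm25Sketch {
        // Classic BM25 weight for one term, with the usual k1/b defaults and a
        // Lucene-style "+1" inside the log to keep the idf positive.
        static double bm25(double tf, double df, double docCount, double docLen, double avgDocLen) {
            double k1 = 1.2, b = 0.75;
            double idf = Math.log((docCount - df + 0.5) / (df + 0.5) + 1);
            return idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * docLen / avgDocLen));
        }

        public static void main(String[] args) {
            // A frequent term in a short document scores higher than in a long one.
            System.out.println(bm25(3, 10, 100_000, 32, 2_000));
            System.out.println(bm25(3, 10, 100_000, 5_000, 2_000));
        }
    }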
SearchResultValuatorTest.java (new file):

@@ -0,0 +1,91 @@
+package nu.marginalia.search.valuation;
+
+import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;
+import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.model.crawl.EdgePageDocumentFlags;
+import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.idx.DocumentMetadata;
+import nu.marginalia.model.idx.WordMetadata;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Set;
+
+import static org.mockito.Mockito.when;
+
+class SearchResultValuatorTest {
+
+    TermFrequencyDict dict;
+    SearchResultValuator valuator;
+
+    @BeforeEach
+    public void setUp() {
+
+        dict = Mockito.mock(TermFrequencyDict.class);
+        when(dict.docCount()).thenReturn(100_000);
+
+        valuator = new SearchResultValuator(dict);
+
+    }
+    List<EdgeSearchResultKeywordScore> titleOnlyLowCountSet = List.of(
+            new EdgeSearchResultKeywordScore(0, "bob",
+                    wordMetadata(32, Set.of(1), EnumSet.of(EdgePageWordFlags.Title)),
+                    docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
+                    false)
+    );
+    List<EdgeSearchResultKeywordScore> highCountNoTitleSet = List.of(
+            new EdgeSearchResultKeywordScore(0, "bob",
+                    wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh)),
+                    docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
+                    false)
+    );
+
+    List<EdgeSearchResultKeywordScore> highCountSubjectSet = List.of(
+            new EdgeSearchResultKeywordScore(0, "bob",
+                    wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh, EdgePageWordFlags.Subjects)),
+                    docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
+                    false)
+    );
+
+
+    List<EdgeSearchResultKeywordScore> first = List.of(
+            new EdgeSearchResultKeywordScore(0, "bob",
+                    wordMetadata(202, Set.of(1,3,4,6,7,9,10,11), EnumSet.of(EdgePageWordFlags.TfIdfHigh)),
+                    docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
+                    false)
+    );
+
+    @Test
+    void evaluateTerms() {
+
+        when(dict.getTermFreq("bob")).thenReturn(10L);
+
+        double titleOnlyLowCount = valuator.evaluateTerms(titleOnlyLowCountSet, 10_000, 32);
+        double titleLongOnlyLowCount = valuator.evaluateTerms(titleOnlyLowCountSet, 10_000, 72);
+        double highCountNoTitle = valuator.evaluateTerms(highCountNoTitleSet, 10_000, 32);
+        double highCountSubject = valuator.evaluateTerms(highCountSubjectSet, 10_000, 32);
+
+        System.out.println(titleOnlyLowCount);
+        System.out.println(titleLongOnlyLowCount);
+        System.out.println(highCountNoTitle);
+        System.out.println(highCountSubject);
+    }
+
+    private long docMetadata(int topology, int year, int sets, int quality, EnumSet<EdgePageDocumentFlags> flags) {
+        return new DocumentMetadata(topology, PubDate.toYearByte(year), sets, quality, flags).encode();
+    }
+
+    private long wordMetadata(int tfIdf, Set<Integer> positions, Set<EdgePageWordFlags> wordFlags) {
+        int posBits = positions.stream()
+                .mapToInt(i -> (int)((1L << i) & 0xFFFF_FFFFL))
+                .reduce((a,b) -> a|b)
+                .orElse(0);
+
+        return new WordMetadata(tfIdf, posBits, positions.size(), wordFlags).encode();
+    }
+
+}