Search ranking debuggability improvements.

This commit is contained in:
Viktor Lofgren 2023-04-02 13:43:24 +02:00
parent 3fb249758e
commit 716ab35b4e
10 changed files with 577 additions and 3 deletions

View File

@ -1,5 +1,6 @@
package nu.marginalia.model.idx;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.model.crawl.PubDate;
import java.util.EnumSet;
@ -17,6 +18,20 @@ public record DocumentMetadata(int avgSentLength,
int quality,
byte flags) {
/**
 * Renders this metadata in a human-readable form for debugging.
 * <p>
 * Several components are shown decoded rather than raw: the domain size is
 * printed as {@code ENC_DOMAIN_SIZE_MULTIPLIER * encDomainSize}, the year is
 * passed through {@link PubDate#fromYearByte}, and the flags byte is printed
 * as the set returned by {@link #flagSet()}.
 *
 * @return a string of the form {@code SimpleName[avgSentL=..., rank=..., ...]}
 */
@Override
public String toString() {
    return getClass().getSimpleName()
            + "[avgSentL=" + avgSentLength
            + ", rank=" + rank
            + ", domainSize=" + (ENC_DOMAIN_SIZE_MULTIPLIER * encDomainSize)
            + ", topology=" + topology
            + ", year=" + PubDate.fromYearByte(year)
            + ", sets=" + sets
            + ", quality=" + quality
            + ", flags=" + flagSet()
            + "]";
}
public static final long ASL_MASK = 0x03L;
public static final int ASL_SHIFT = 56;
@ -133,4 +148,8 @@ public record DocumentMetadata(int avgSentLength,
return encoded | min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
}
/**
 * Decodes the packed {@code flags} byte of this record.
 *
 * @return the result of {@code DocumentFlags.decode(flags)}
 */
public EnumSet<DocumentFlags> flagSet() {
    final EnumSet<DocumentFlags> decoded = DocumentFlags.decode(flags);
    return decoded;
}
}

View File

@ -35,6 +35,7 @@ dependencies {
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation project(':code:processes:converting-process')
}
test {

View File

@ -4,6 +4,7 @@ import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import java.util.*;
@ -85,4 +86,16 @@ public class DocumentKeywordsBuilder {
return words.size();
}
/**
 * Looks up the encoded metadata stored for {@code word} and wraps it in a
 * {@link WordMetadata}.
 * <p>
 * NOTE(review): for a word that was never added, the result is built from
 * whatever default value the underlying map returns (presumably 0) — confirm.
 *
 * @param word the keyword to look up
 * @return the decoded metadata for the keyword
 */
public WordMetadata getMetaForWord(String word) {
    final long encodedMeta = words.getLong(word);
    return new WordMetadata(encodedMeta);
}
/**
 * Lists every keyword together with its decoded metadata, for debugging.
 *
 * @return a string of the form {@code [ word->metadata word->metadata ]}
 */
@Override
public String toString() {
    final StringBuilder out = new StringBuilder("[ ");

    words.forEach((term, encodedMeta) -> {
        // Each entry is followed by a single space, including the last one.
        out.append(term);
        out.append("->");
        out.append(new WordMetadata(encodedMeta));
        out.append(' ');
    });

    out.append(']');
    return out.toString();
}
}

View File

@ -1,8 +1,20 @@
package nu.marginalia.keyword;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.Objects;
class DocumentKeywordExtractorTest {
@Test
@ -21,4 +33,27 @@ class DocumentKeywordExtractorTest {
Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h"));
Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
}
/**
 * Debugging aid: extracts keywords from the {@code test-data/keyboards.html}
 * fixture and prints the word metadata for a few terms so it can be inspected
 * by hand. The test makes no assertions; it only fails if loading or
 * extraction throws.
 */
@Test
public void testKeyboards() throws IOException, URISyntaxException {
    final String html;
    // Close the classpath stream once it has been read, and decode it with a
    // fixed charset so the result does not depend on the machine's default.
    try (var resource = Objects.requireNonNull(
            ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
            "Could not load test-data/keyboards.html")) {
        html = new String(resource.readAllBytes(), java.nio.charset.StandardCharsets.UTF_8);
    }

    var doc = Jsoup.parse(html);
    doc.filter(new DomPruningFilter(0.5));

    DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

    var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));

    // Metadata the extractor assigned to the terms of interest.
    System.out.println(keywords.getMetaForWord("mechanical"));
    System.out.println(keywords.getMetaForWord("keyboard"));
    System.out.println(keywords.getMetaForWord("keyboards"));

    // Hard-coded encoded metadata values, decoded for manual comparison.
    // NOTE(review): presumably captured from a live index — confirm origin.
    System.out.println(new WordMetadata(8894889328781L));
    System.out.println(new WordMetadata(4294967297L));
    System.out.println(new WordMetadata(566820053975498886L));
    // -
    System.out.println(new WordMetadata(1198298103937L));
}
}

File diff suppressed because one or more lines are too long

View File

@ -89,7 +89,8 @@ public class DbUrlDetailsQuery {
Double.MAX_VALUE, // termScore
1, // resultsFromSameDomain
"", // positions
null // result item
null, // result item
null // keyword scores
);
if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
&& Strings.isNullOrEmpty(val.description)

View File

@ -2,11 +2,13 @@ package nu.marginalia.search.model;
import lombok.*;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.HtmlFeature;
import java.util.EnumSet;
import java.util.List;
import java.util.Objects;
import java.util.StringJoiner;
@ -37,6 +39,7 @@ public class UrlDetails {
public String positions;
public SearchResultItem resultItem;
public List<SearchResultKeywordScore> keywordScores;
public boolean hasMoreResults() {
return resultsFromSameDomain > 1;

View File

@ -64,6 +64,9 @@ public class SearchResultDecorator {
details.resultsFromSameDomain = resultItem.resultsFromDomain;
details.termScore = calculateTermScore(resultItem, details, resultSet.rankingContext);
if (getClass().desiredAssertionStatus()) {
details.keywordScores = resultItem.keywordScores;
}
details.positions = getPositionsString(resultItem);
details.resultItem = resultItem;

View File

@ -1,4 +1,8 @@
<!-- {{rankingId}}.{{id}}/{{ranking}}/{{termScore}} -->
{{! Debug dump of the per-keyword scores for this result. The values are HTML-escaped so that a value containing "-->" cannot close the comment early. }}
<!--
{{#each keywordScores}} {{.}}
{{/each}}
-->
<section class="card search-result {{#unless hideRanking}}rs-rank-{{logRank}} ms-rank-{{matchRank}}{{/unless}} {{#if specialDomain}}special-domain{{/if}}" >
<div class="url"><a rel="nofollow external" href="{{url}}">{{url}}</a></div>
<h2> <a tabindex="-1" class="title" rel="nofollow external" href="{{url}}">{{title}}</a> </h2>

View File

@ -37,7 +37,9 @@ public class LoadTestMain {
long startTime = System.currentTimeMillis();
var rsp = client.send(req, HttpResponse.BodyHandlers.ofString());
client.send(req, HttpResponse.BodyHandlers.ofString());
long stopTime = System.currentTimeMillis();
times.add(stopTime - startTime);
@ -45,7 +47,6 @@ public class LoadTestMain {
System.out.println(times.stream().mapToLong(Long::longValue).average().orElse(-1));
times.clear();
}
// System.out.println(stopTime - startTime);
}
}