mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Search ranking debuggability improvements.
This commit is contained in:
parent
3fb249758e
commit
716ab35b4e
@ -1,5 +1,6 @@
|
||||
package nu.marginalia.model.idx;
|
||||
|
||||
import nu.marginalia.bbpc.BrailleBlockPunchCards;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
|
||||
import java.util.EnumSet;
|
||||
@ -17,6 +18,20 @@ public record DocumentMetadata(int avgSentLength,
|
||||
int quality,
|
||||
byte flags) {
|
||||
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder(getClass().getSimpleName());
|
||||
sb.append('[')
|
||||
.append("avgSentL=").append(avgSentLength).append(", ")
|
||||
.append("rank=").append(rank).append(", ")
|
||||
.append("domainSize=").append(ENC_DOMAIN_SIZE_MULTIPLIER * encDomainSize).append(", ")
|
||||
.append("topology=").append(topology).append(", ")
|
||||
.append("year=").append(PubDate.fromYearByte(year)).append(", ")
|
||||
.append("sets=").append(sets).append(", ")
|
||||
.append("quality=").append(quality).append(", ")
|
||||
.append("flags=").append(flagSet()).append("]");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public static final long ASL_MASK = 0x03L;
|
||||
public static final int ASL_SHIFT = 56;
|
||||
|
||||
@ -133,4 +148,8 @@ public record DocumentMetadata(int avgSentLength,
|
||||
return encoded | min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
|
||||
}
|
||||
|
||||
public EnumSet<DocumentFlags> flagSet() {
|
||||
return DocumentFlags.decode(flags);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -35,6 +35,7 @@ dependencies {
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
testImplementation project(':code:processes:converting-process')
|
||||
}
|
||||
|
||||
test {
|
||||
|
@ -4,6 +4,7 @@ import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
|
||||
import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@ -85,4 +86,16 @@ public class DocumentKeywordsBuilder {
|
||||
return words.size();
|
||||
}
|
||||
|
||||
public WordMetadata getMetaForWord(String word) {
|
||||
return new WordMetadata(words.getLong(word));
|
||||
}
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder("[ ");
|
||||
words.forEach((word, meta) -> {
|
||||
sb.append(word).append("->").append(new WordMetadata(meta)).append(' ');
|
||||
});
|
||||
return sb.append(']').toString();
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -1,8 +1,20 @@
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Objects;
|
||||
|
||||
class DocumentKeywordExtractorTest {
|
||||
|
||||
@Test
|
||||
@ -21,4 +33,27 @@ class DocumentKeywordExtractorTest {
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h"));
|
||||
Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testKeyboards() throws IOException, URISyntaxException {
|
||||
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
|
||||
"Could not load word frequency table");
|
||||
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
|
||||
var doc = Jsoup.parse(html);
|
||||
doc.filter(new DomPruningFilter(0.5));
|
||||
|
||||
DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
|
||||
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||
|
||||
var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
|
||||
System.out.println(keywords.getMetaForWord("mechanical"));
|
||||
System.out.println(keywords.getMetaForWord("keyboard"));
|
||||
System.out.println(keywords.getMetaForWord("keyboards"));
|
||||
|
||||
System.out.println(new WordMetadata(8894889328781L));
|
||||
System.out.println(new WordMetadata(4294967297L));
|
||||
System.out.println(new WordMetadata(566820053975498886L));
|
||||
// -
|
||||
System.out.println(new WordMetadata(1198298103937L));
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
@ -89,7 +89,8 @@ public class DbUrlDetailsQuery {
|
||||
Double.MAX_VALUE, // termScore
|
||||
1, // resultsFromSameDomain
|
||||
"", // positions
|
||||
null // result item
|
||||
null, // result item
|
||||
null // keyword scores
|
||||
);
|
||||
if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
|
||||
&& Strings.isNullOrEmpty(val.description)
|
||||
|
@ -2,11 +2,13 @@ package nu.marginalia.search.model;
|
||||
|
||||
import lombok.*;
|
||||
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
@ -37,6 +39,7 @@ public class UrlDetails {
|
||||
|
||||
public String positions;
|
||||
public SearchResultItem resultItem;
|
||||
public List<SearchResultKeywordScore> keywordScores;
|
||||
|
||||
public boolean hasMoreResults() {
|
||||
return resultsFromSameDomain > 1;
|
||||
|
@ -64,6 +64,9 @@ public class SearchResultDecorator {
|
||||
|
||||
details.resultsFromSameDomain = resultItem.resultsFromDomain;
|
||||
details.termScore = calculateTermScore(resultItem, details, resultSet.rankingContext);
|
||||
if (getClass().desiredAssertionStatus()) {
|
||||
details.keywordScores = resultItem.keywordScores;
|
||||
}
|
||||
details.positions = getPositionsString(resultItem);
|
||||
details.resultItem = resultItem;
|
||||
|
||||
|
@ -1,4 +1,8 @@
|
||||
<!-- {{rankingId}}.{{id}}/{{ranking}}/{{termScore}} -->
|
||||
<!--
|
||||
{{#each keywordScores}} {{{.}}}
|
||||
{{/each}}
|
||||
-->
|
||||
<section class="card search-result {{#unless hideRanking}}rs-rank-{{logRank}} ms-rank-{{matchRank}}{{/unless}} {{#if specialDomain}}special-domain{{/if}}" >
|
||||
<div class="url"><a rel="nofollow external" href="{{url}}">{{url}}</a></div>
|
||||
<h2> <a tabindex="-1" class="title" rel="nofollow external" href="{{url}}">{{title}}</a> </h2>
|
||||
|
@ -37,7 +37,9 @@ public class LoadTestMain {
|
||||
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
var rsp = client.send(req, HttpResponse.BodyHandlers.ofString());
|
||||
|
||||
client.send(req, HttpResponse.BodyHandlers.ofString());
|
||||
|
||||
long stopTime = System.currentTimeMillis();
|
||||
|
||||
times.add(stopTime - startTime);
|
||||
@ -45,7 +47,6 @@ public class LoadTestMain {
|
||||
System.out.println(times.stream().mapToLong(Long::longValue).average().orElse(-1));
|
||||
times.clear();
|
||||
}
|
||||
// System.out.println(stopTime - startTime);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user