(index) Simplify verbatim match calculation

This commit is contained in:
Viktor Lofgren 2024-08-10 09:50:10 +02:00
parent 4264fb9f49
commit 41b52f5bcd
5 changed files with 139 additions and 86 deletions

View File

@ -24,6 +24,7 @@ dependencies {
implementation project(':code:libraries:btree')
implementation project(':code:libraries:slop')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')
implementation project(':code:common:db')
implementation project(':code:common:config')

View File

@ -15,6 +15,22 @@ public class DocumentSpans {
public DocumentSpan externalLinkText = EMPTY_SPAN;
public DocumentSpan getSpan(HtmlTag tag) {
if (tag == HtmlTag.HEADING)
return heading;
else if (tag == HtmlTag.TITLE)
return title;
else if (tag == HtmlTag.NAV)
return nav;
else if (tag == HtmlTag.CODE)
return code;
else if (tag == HtmlTag.ANCHOR)
return anchor;
else if (tag == HtmlTag.EXTERNAL_LINKTEXT)
return externalLinkText;
return EMPTY_SPAN;
}
void accept(byte code, CodedSequence positions) {
if (code == HtmlTag.HEADING.code)
this.heading = new DocumentSpan(positions);

View File

@ -15,6 +15,7 @@ import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.TermCoherenceGroupList;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.id.UrlIdCodec;
@ -27,6 +28,7 @@ import nu.marginalia.sequence.SequenceOperations;
import javax.annotation.Nullable;
import java.lang.foreign.Arena;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
@ -137,6 +139,8 @@ public class IndexResultScoreCalculator {
return true;
}
public double calculateSearchResultValue(DebugRankingFactors rankingFactors,
QuerySearchTerms searchTerms,
CompiledQueryLong wordFlagsQuery,
@ -181,67 +185,13 @@ public class IndexResultScoreCalculator {
final int titleLength = Math.max(1, spans.title.length());
float verbatimMatchScore = 0.f;
VerbatimMatches verbatimMatches = new VerbatimMatches();
boolean verbatimMatchInTitle;
boolean verbatimMatchInHeading;
boolean verbatimMatchInAnchor;
boolean verbatimMatchInNav;
boolean verbatimMatchInCode;
boolean verbatimMatchInBody;
boolean verbatimMatchInExtLink;
// Calculate a bonus for keyword coherences when large ones exist
int largestOptional = coherences.largestOptional();
if (largestOptional >= 2) {
verbatimMatchInTitle = (largestOptional == coherences.testOptional(positions, spans.title));
verbatimMatchInHeading = (largestOptional == coherences.testOptional(positions, spans.heading));
verbatimMatchInAnchor = (largestOptional == coherences.testOptional(positions, spans.anchor));
verbatimMatchInNav = (largestOptional == coherences.testOptional(positions, spans.nav));
verbatimMatchInCode = (largestOptional == coherences.testOptional(positions, spans.code));
verbatimMatchInExtLink = (largestOptional == coherences.testOptional(positions, spans.code));
verbatimMatchInBody = (largestOptional == coherences.testOptional(positions));
}
else {
verbatimMatchInTitle = false;
verbatimMatchInHeading = false;
verbatimMatchInAnchor = false;
verbatimMatchInNav = false;
verbatimMatchInCode = false;
verbatimMatchInBody = false;
verbatimMatchInExtLink = false;
}
if (verbatimMatchInTitle) {
// verbatim title match
verbatimMatchScore = 4.0f * largestOptional;
// additional bonus if the match is most of the title's length
verbatimMatchScore += 2.f * largestOptional / titleLength;
}
else if (verbatimMatchInHeading) {
verbatimMatchScore = 1.5f * largestOptional;
}
else if (verbatimMatchInAnchor || verbatimMatchInCode) {
verbatimMatchScore = 0.2f * largestOptional;
}
else if (verbatimMatchInNav) {
verbatimMatchScore = 0.1f * largestOptional;
}
else if (verbatimMatchInBody) {
verbatimMatchScore = 0.75f * largestOptional;
}
if (coherences.numOptional() > 0) {
verbatimMatchScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2);
}
if (verbatimMatchInExtLink) { // Nudge the result up if there is a verbatim match in the external link text
verbatimMatchScore += 1.0f * largestOptional;
}
float verbatimMatchScore = findVerbatimMatches(verbatimMatches, coherences, positions, spans);
float[] weightedCounts = new float[compiledQuery.size()];
int firstPosition = Integer.MAX_VALUE;
float keywordMinDistFac = 0;
if (positions.length > 2) {
List<IntIterator> iterators = new ArrayList<>(positions.length);
@ -268,6 +218,7 @@ public class IndexResultScoreCalculator {
int unorderedMatchInTitleCount = 0;
int unorderedMatchInHeadingCount = 0;
int firstPosition = 0;
for (int i = 0; i < weightedCounts.length; i++) {
if (positions[i] != null && ctx.regularMask.get(i)) {
searchableKeywordsCount ++;
@ -312,12 +263,12 @@ public class IndexResultScoreCalculator {
}
}
if (!verbatimMatchInTitle && searchableKeywordsCount > 2 && unorderedMatchInTitleCount == searchableKeywordsCount) {
if (!verbatimMatches.get(HtmlTag.TITLE) && searchableKeywordsCount > 2 && unorderedMatchInTitleCount == searchableKeywordsCount) {
verbatimMatchScore += 2.5f * unorderedMatchInTitleCount;
verbatimMatchScore += 2.f * unorderedMatchInTitleCount / titleLength;
}
if (!verbatimMatchInHeading && unorderedMatchInHeadingCount == searchableKeywordsCount) {
if (!verbatimMatches.get(HtmlTag.HEADING) && unorderedMatchInHeadingCount == searchableKeywordsCount) {
verbatimMatchScore += 2.0f * unorderedMatchInHeadingCount;
}
@ -373,26 +324,10 @@ public class IndexResultScoreCalculator {
}
}
if (verbatimMatchInAnchor) {
rankingFactors.addTermFactor(termId, "verbatim.anchor", "true");
}
if (verbatimMatchInBody) {
rankingFactors.addTermFactor(termId, "verbatim.body", "true");
}
if (verbatimMatchInCode) {
rankingFactors.addTermFactor(termId, "verbatim.code", "true");
}
if (verbatimMatchInExtLink) {
rankingFactors.addTermFactor(termId, "verbatim.extLink", "true");
}
if (verbatimMatchInHeading) {
rankingFactors.addTermFactor(termId, "verbatim.heading", "true");
}
if (verbatimMatchInNav) {
rankingFactors.addTermFactor(termId, "verbatim.nav", "true");
}
if (verbatimMatchInTitle) {
rankingFactors.addTermFactor(termId, "verbatim.title", "true");
for (HtmlTag tag : HtmlTag.includedTags) {
if (verbatimMatches.get(tag)) {
rankingFactors.addTermFactor(termId, "verbatim." + tag.name().toLowerCase(), "true");
}
}
if (positions[i] != null) {
@ -430,6 +365,82 @@ public class IndexResultScoreCalculator {
}
}
private float findVerbatimMatches(VerbatimMatches verbatimMatches,
TermCoherenceGroupList coherences,
CodedSequence[] positions,
DocumentSpans spans) {
// Calculate a bonus for keyword coherences when large ones exist
int largestOptional = coherences.largestOptional();
if (largestOptional < 2) {
return 0;
}
float verbatimMatchScore = 0.f;
for (var optionalGroup : coherences.getOptionalGroups()) {
int groupSize = optionalGroup.size;
float sizeScalingFactor = groupSize / (float) largestOptional;
for (var tag : HtmlTag.includedTags) {
if (optionalGroup.test(spans.getSpan(tag), positions)) {
verbatimMatchScore += verbatimMatches.getWeight(tag) * sizeScalingFactor * groupSize;
if (optionalGroup.size == largestOptional) {
verbatimMatches.set(tag);
}
}
}
}
if (coherences.numOptional() > 0) {
verbatimMatchScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2);
}
return verbatimMatchScore;
}
private static class VerbatimMatches {
private final BitSet matches;
private final float[] weights;
public VerbatimMatches() {
matches = new BitSet(HtmlTag.includedTags.length);
weights = new float[] { HtmlTag.includedTags.length };
for (int i = 0; i < weights.length; i++) {
weights[i] = switch(HtmlTag.includedTags[i]) {
case TITLE -> 4.0f;
case HEADING -> 1.5f;
case ANCHOR -> 0.2f;
case NAV -> 0.1f;
case CODE -> 0.25f;
case EXTERNAL_LINKTEXT -> 1.0f;
default -> 0.0f;
};
}
}
public boolean get(HtmlTag tag) {
assert !tag.exclude;
return matches.get(tag.ordinal());
}
public void set(HtmlTag tag) {
assert !tag.exclude;
matches.set(tag.ordinal());
}
public float getWeight(HtmlTag tag) {
assert !tag.exclude;
return weights[tag.ordinal()];
}
}
private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
if (size < 400) {
if (quality < 5)

View File

@ -10,6 +10,7 @@ import nu.marginalia.sequence.SequenceOperations;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collections;
import java.util.List;
/**
@ -29,6 +30,10 @@ public class TermCoherenceGroupList {
}
}
public List<TermCoherenceGroup> getOptionalGroups() {
return Collections.unmodifiableList(optionalGroups);
}
public boolean testMandatory(CodedSequence[] positions) {
for (var coherenceSet : mandatoryGroups) {

View File

@ -1,24 +1,27 @@
package nu.marginalia.language.sentence.tag;
public enum HtmlTag {
FORM((byte) 0, true, false),
SCRIPT((byte) 0, true, false),
STYLE((byte) 0, true, false),
import java.util.Arrays;
public enum HtmlTag {
ANCHOR((byte) 'a', false, false),
TITLE((byte) 't', false, false),
HEADING((byte) 'h', false, false),
CODE((byte) 'c', false, true),
NAV((byte) 'n', false, false),
// pseudo-tags for internal use
// pseudo-tags for internal use,
BODY((byte) 'b', false, false),
EXTERNAL_LINKTEXT((byte) 'x', false, false),
// excluded tags must be put last!
FORM((byte) 0, true, false),
SCRIPT((byte) 0, true, false),
STYLE((byte) 0, true, false),
;
public byte code;
public boolean exclude;
public boolean nonLanguage;
public final byte code;
public final boolean exclude;
public final boolean nonLanguage;
HtmlTag(byte code, boolean exclude, boolean nonLanguage) {
this.code = code;
@ -26,4 +29,21 @@ public enum HtmlTag {
this.nonLanguage = nonLanguage;
}
// This is a bit of a hack to get the included tags in the order they are defined in the enum
public static final HtmlTag[] includedTags;
static {
HtmlTag[] values = values();
includedTags = new HtmlTag[(int) Arrays.stream(values).filter(tag -> !tag.exclude).count()];
for (int i = 0; i < values.length; i++) {
if (i != values[i].ordinal()) {
throw new IllegalStateException("Excluded tags must be put last");
}
if (!values()[i].exclude) {
includedTags[i] = values()[i];
}
}
}
}