(index) Include external link texts in verbatim score

This commit is contained in:
Viktor Lofgren 2024-08-06 10:23:23 +02:00
parent 2080e31616
commit ca6e2db2b9
2 changed files with 23 additions and 12 deletions

View File

@ -13,6 +13,8 @@ public class DocumentSpans {
public DocumentSpan code = EMPTY_SPAN; public DocumentSpan code = EMPTY_SPAN;
public DocumentSpan anchor = EMPTY_SPAN; public DocumentSpan anchor = EMPTY_SPAN;
public DocumentSpan externalLinkText = EMPTY_SPAN;
void accept(byte code, CodedSequence positions) { void accept(byte code, CodedSequence positions) {
if (code == HtmlTag.HEADING.code) if (code == HtmlTag.HEADING.code)
this.heading = new DocumentSpan(positions); this.heading = new DocumentSpan(positions);
@ -24,6 +26,8 @@ public class DocumentSpans {
this.code = new DocumentSpan(positions); this.code = new DocumentSpan(positions);
else if (code == HtmlTag.ANCHOR.code) else if (code == HtmlTag.ANCHOR.code)
this.anchor = new DocumentSpan(positions); this.anchor = new DocumentSpan(positions);
else if (code == HtmlTag.EXTERNAL_LINKTEXT.code)
this.externalLinkText = new DocumentSpan(positions);
} }
} }

View File

@ -199,7 +199,7 @@ public class IndexResultScoreCalculator {
final int titleLength = Math.max(1, spans.title.length()); final int titleLength = Math.max(1, spans.title.length());
float coherenceScore = 0.f; float verbatimMatchScore = 0.f;
boolean verbatimMatchInTitle; boolean verbatimMatchInTitle;
boolean verbatimMatchInHeading; boolean verbatimMatchInHeading;
@ -207,6 +207,7 @@ public class IndexResultScoreCalculator {
boolean verbatimMatchInNav; boolean verbatimMatchInNav;
boolean verbatimMatchInCode; boolean verbatimMatchInCode;
boolean verbatimMatchInBody; boolean verbatimMatchInBody;
boolean verbatimMatchInExtLink;
// Calculate a bonus for keyword coherences when large ones exist // Calculate a bonus for keyword coherences when large ones exist
int largestOptional = coherences.largestOptional(); int largestOptional = coherences.largestOptional();
@ -216,6 +217,7 @@ public class IndexResultScoreCalculator {
verbatimMatchInAnchor = (largestOptional == coherences.testOptional(positions, spans.anchor)); verbatimMatchInAnchor = (largestOptional == coherences.testOptional(positions, spans.anchor));
verbatimMatchInNav = (largestOptional == coherences.testOptional(positions, spans.nav)); verbatimMatchInNav = (largestOptional == coherences.testOptional(positions, spans.nav));
verbatimMatchInCode = (largestOptional == coherences.testOptional(positions, spans.code)); verbatimMatchInCode = (largestOptional == coherences.testOptional(positions, spans.code));
// Test the external link text span, not the code span, so the ext-link bonus reflects matches in external link texts
verbatimMatchInExtLink = (largestOptional == coherences.testOptional(positions, spans.externalLinkText));
verbatimMatchInBody = (largestOptional == coherences.testOptional(positions)); verbatimMatchInBody = (largestOptional == coherences.testOptional(positions));
} }
else { else {
@ -225,29 +227,34 @@ public class IndexResultScoreCalculator {
verbatimMatchInNav = false; verbatimMatchInNav = false;
verbatimMatchInCode = false; verbatimMatchInCode = false;
verbatimMatchInBody = false; verbatimMatchInBody = false;
verbatimMatchInExtLink = false;
} }
if (verbatimMatchInTitle) { if (verbatimMatchInTitle) {
// verbatim title match // verbatim title match
coherenceScore = 4.0f * largestOptional; verbatimMatchScore = 4.0f * largestOptional;
// additional bonus if the match is most of the title's length // additional bonus if the match is most of the title's length
coherenceScore += 2.f * largestOptional / titleLength; verbatimMatchScore += 2.f * largestOptional / titleLength;
} }
else if (verbatimMatchInHeading) { else if (verbatimMatchInHeading) {
coherenceScore = 1.5f * largestOptional; verbatimMatchScore = 1.5f * largestOptional;
} }
else if (verbatimMatchInAnchor || verbatimMatchInCode) { else if (verbatimMatchInAnchor || verbatimMatchInCode) {
coherenceScore = 0.2f * largestOptional; verbatimMatchScore = 0.2f * largestOptional;
} }
else if (verbatimMatchInNav) { else if (verbatimMatchInNav) {
coherenceScore = 0.1f * largestOptional; verbatimMatchScore = 0.1f * largestOptional;
} }
else if (verbatimMatchInBody) { else if (verbatimMatchInBody) {
coherenceScore = 0.75f * largestOptional; verbatimMatchScore = 0.75f * largestOptional;
} }
if (coherences.numOptional() > 0) { if (coherences.numOptional() > 0) {
coherenceScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2); verbatimMatchScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2);
}
if (verbatimMatchInExtLink) { // Nudge the result up if there is a verbatim match in the external link text
verbatimMatchScore += 1.0f * largestOptional;
} }
float[] weightedCounts = new float[compiledQuery.size()]; float[] weightedCounts = new float[compiledQuery.size()];
@ -318,12 +325,12 @@ public class IndexResultScoreCalculator {
} }
if (!verbatimMatchInTitle && searchableKeywordsCount > 2 && unorderedMatchInTitleCount == searchableKeywordsCount) { if (!verbatimMatchInTitle && searchableKeywordsCount > 2 && unorderedMatchInTitleCount == searchableKeywordsCount) {
coherenceScore += 2.5f * unorderedMatchInTitleCount; verbatimMatchScore += 2.5f * unorderedMatchInTitleCount;
coherenceScore += 2.f * unorderedMatchInTitleCount / titleLength; verbatimMatchScore += 2.f * unorderedMatchInTitleCount / titleLength;
} }
if (!verbatimMatchInHeading && unorderedMatchInHeadingCount == searchableKeywordsCount) { if (!verbatimMatchInHeading && unorderedMatchInHeadingCount == searchableKeywordsCount) {
coherenceScore += 2.0f * unorderedMatchInHeadingCount; verbatimMatchScore += 2.0f * unorderedMatchInHeadingCount;
} }
double overallPart = averageSentenceLengthPenalty double overallPart = averageSentenceLengthPenalty
@ -333,7 +340,7 @@ public class IndexResultScoreCalculator {
+ topologyBonus + topologyBonus
+ temporalBias + temporalBias
+ flagsPenalty + flagsPenalty
+ coherenceScore + verbatimMatchScore
+ keywordMinDistFac; + keywordMinDistFac;
double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx)); double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));