mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(index) Include external link texts in verbatim score
This commit is contained in:
parent
2080e31616
commit
ca6e2db2b9
@ -13,6 +13,8 @@ public class DocumentSpans {
|
||||
public DocumentSpan code = EMPTY_SPAN;
|
||||
public DocumentSpan anchor = EMPTY_SPAN;
|
||||
|
||||
public DocumentSpan externalLinkText = EMPTY_SPAN;
|
||||
|
||||
void accept(byte code, CodedSequence positions) {
|
||||
if (code == HtmlTag.HEADING.code)
|
||||
this.heading = new DocumentSpan(positions);
|
||||
@ -24,6 +26,8 @@ public class DocumentSpans {
|
||||
this.code = new DocumentSpan(positions);
|
||||
else if (code == HtmlTag.ANCHOR.code)
|
||||
this.anchor = new DocumentSpan(positions);
|
||||
else if (code == HtmlTag.EXTERNAL_LINKTEXT.code)
|
||||
this.externalLinkText = new DocumentSpan(positions);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -199,7 +199,7 @@ public class IndexResultScoreCalculator {
|
||||
|
||||
final int titleLength = Math.max(1, spans.title.length());
|
||||
|
||||
float coherenceScore = 0.f;
|
||||
float verbatimMatchScore = 0.f;
|
||||
|
||||
boolean verbatimMatchInTitle;
|
||||
boolean verbatimMatchInHeading;
|
||||
@ -207,6 +207,7 @@ public class IndexResultScoreCalculator {
|
||||
boolean verbatimMatchInNav;
|
||||
boolean verbatimMatchInCode;
|
||||
boolean verbatimMatchInBody;
|
||||
boolean verbatimMatchInExtLink;
|
||||
|
||||
// Calculate a bonus for keyword coherences when large ones exist
|
||||
int largestOptional = coherences.largestOptional();
|
||||
@ -216,6 +217,7 @@ public class IndexResultScoreCalculator {
|
||||
verbatimMatchInAnchor = (largestOptional == coherences.testOptional(positions, spans.anchor));
|
||||
verbatimMatchInNav = (largestOptional == coherences.testOptional(positions, spans.nav));
|
||||
verbatimMatchInCode = (largestOptional == coherences.testOptional(positions, spans.code));
|
||||
// Test the external link text span, not the code span; the code span is already covered by verbatimMatchInCode
verbatimMatchInExtLink = (largestOptional == coherences.testOptional(positions, spans.externalLinkText));
|
||||
verbatimMatchInBody = (largestOptional == coherences.testOptional(positions));
|
||||
}
|
||||
else {
|
||||
@ -225,29 +227,34 @@ public class IndexResultScoreCalculator {
|
||||
verbatimMatchInNav = false;
|
||||
verbatimMatchInCode = false;
|
||||
verbatimMatchInBody = false;
|
||||
verbatimMatchInExtLink = false;
|
||||
}
|
||||
|
||||
if (verbatimMatchInTitle) {
|
||||
// verbatim title match
|
||||
coherenceScore = 4.0f * largestOptional;
|
||||
verbatimMatchScore = 4.0f * largestOptional;
|
||||
// additional bonus if the match is most of the title's length
|
||||
coherenceScore += 2.f * largestOptional / titleLength;
|
||||
verbatimMatchScore += 2.f * largestOptional / titleLength;
|
||||
}
|
||||
else if (verbatimMatchInHeading) {
|
||||
coherenceScore = 1.5f * largestOptional;
|
||||
verbatimMatchScore = 1.5f * largestOptional;
|
||||
}
|
||||
else if (verbatimMatchInAnchor || verbatimMatchInCode) {
|
||||
coherenceScore = 0.2f * largestOptional;
|
||||
verbatimMatchScore = 0.2f * largestOptional;
|
||||
}
|
||||
else if (verbatimMatchInNav) {
|
||||
coherenceScore = 0.1f * largestOptional;
|
||||
verbatimMatchScore = 0.1f * largestOptional;
|
||||
}
|
||||
else if (verbatimMatchInBody) {
|
||||
coherenceScore = 0.75f * largestOptional;
|
||||
verbatimMatchScore = 0.75f * largestOptional;
|
||||
}
|
||||
|
||||
if (coherences.numOptional() > 0) {
|
||||
coherenceScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2);
|
||||
verbatimMatchScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2);
|
||||
}
|
||||
|
||||
if (verbatimMatchInExtLink) { // Nudge the result up if there is a verbatim match in the external link text
|
||||
verbatimMatchScore += 1.0f * largestOptional;
|
||||
}
|
||||
|
||||
float[] weightedCounts = new float[compiledQuery.size()];
|
||||
@ -318,12 +325,12 @@ public class IndexResultScoreCalculator {
|
||||
}
|
||||
|
||||
if (!verbatimMatchInTitle && searchableKeywordsCount > 2 && unorderedMatchInTitleCount == searchableKeywordsCount) {
|
||||
coherenceScore += 2.5f * unorderedMatchInTitleCount;
|
||||
coherenceScore += 2.f * unorderedMatchInTitleCount / titleLength;
|
||||
verbatimMatchScore += 2.5f * unorderedMatchInTitleCount;
|
||||
verbatimMatchScore += 2.f * unorderedMatchInTitleCount / titleLength;
|
||||
}
|
||||
|
||||
if (!verbatimMatchInHeading && unorderedMatchInHeadingCount == searchableKeywordsCount) {
|
||||
coherenceScore += 2.0f * unorderedMatchInHeadingCount;
|
||||
verbatimMatchScore += 2.0f * unorderedMatchInHeadingCount;
|
||||
}
|
||||
|
||||
double overallPart = averageSentenceLengthPenalty
|
||||
@ -333,7 +340,7 @@ public class IndexResultScoreCalculator {
|
||||
+ topologyBonus
|
||||
+ temporalBias
|
||||
+ flagsPenalty
|
||||
+ coherenceScore
|
||||
+ verbatimMatchScore
|
||||
+ keywordMinDistFac;
|
||||
|
||||
double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));
|
||||
|
Loading…
Reference in New Issue
Block a user