mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(index) Include external link texts in verbatim score
This commit is contained in:
parent
2080e31616
commit
ca6e2db2b9
@ -13,6 +13,8 @@ public class DocumentSpans {
|
|||||||
public DocumentSpan code = EMPTY_SPAN;
|
public DocumentSpan code = EMPTY_SPAN;
|
||||||
public DocumentSpan anchor = EMPTY_SPAN;
|
public DocumentSpan anchor = EMPTY_SPAN;
|
||||||
|
|
||||||
|
public DocumentSpan externalLinkText = EMPTY_SPAN;
|
||||||
|
|
||||||
void accept(byte code, CodedSequence positions) {
|
void accept(byte code, CodedSequence positions) {
|
||||||
if (code == HtmlTag.HEADING.code)
|
if (code == HtmlTag.HEADING.code)
|
||||||
this.heading = new DocumentSpan(positions);
|
this.heading = new DocumentSpan(positions);
|
||||||
@ -24,6 +26,8 @@ public class DocumentSpans {
|
|||||||
this.code = new DocumentSpan(positions);
|
this.code = new DocumentSpan(positions);
|
||||||
else if (code == HtmlTag.ANCHOR.code)
|
else if (code == HtmlTag.ANCHOR.code)
|
||||||
this.anchor = new DocumentSpan(positions);
|
this.anchor = new DocumentSpan(positions);
|
||||||
|
else if (code == HtmlTag.EXTERNAL_LINKTEXT.code)
|
||||||
|
this.externalLinkText = new DocumentSpan(positions);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -199,7 +199,7 @@ public class IndexResultScoreCalculator {
|
|||||||
|
|
||||||
final int titleLength = Math.max(1, spans.title.length());
|
final int titleLength = Math.max(1, spans.title.length());
|
||||||
|
|
||||||
float coherenceScore = 0.f;
|
float verbatimMatchScore = 0.f;
|
||||||
|
|
||||||
boolean verbatimMatchInTitle;
|
boolean verbatimMatchInTitle;
|
||||||
boolean verbatimMatchInHeading;
|
boolean verbatimMatchInHeading;
|
||||||
@ -207,6 +207,7 @@ public class IndexResultScoreCalculator {
|
|||||||
boolean verbatimMatchInNav;
|
boolean verbatimMatchInNav;
|
||||||
boolean verbatimMatchInCode;
|
boolean verbatimMatchInCode;
|
||||||
boolean verbatimMatchInBody;
|
boolean verbatimMatchInBody;
|
||||||
|
boolean verbatimMatchInExtLink;
|
||||||
|
|
||||||
// Calculate a bonus for keyword coherences when large ones exist
|
// Calculate a bonus for keyword coherences when large ones exist
|
||||||
int largestOptional = coherences.largestOptional();
|
int largestOptional = coherences.largestOptional();
|
||||||
@ -216,6 +217,7 @@ public class IndexResultScoreCalculator {
|
|||||||
verbatimMatchInAnchor = (largestOptional == coherences.testOptional(positions, spans.anchor));
|
verbatimMatchInAnchor = (largestOptional == coherences.testOptional(positions, spans.anchor));
|
||||||
verbatimMatchInNav = (largestOptional == coherences.testOptional(positions, spans.nav));
|
verbatimMatchInNav = (largestOptional == coherences.testOptional(positions, spans.nav));
|
||||||
verbatimMatchInCode = (largestOptional == coherences.testOptional(positions, spans.code));
|
verbatimMatchInCode = (largestOptional == coherences.testOptional(positions, spans.code));
|
||||||
|
verbatimMatchInExtLink = (largestOptional == coherences.testOptional(positions, spans.externalLinkText)); // test the external link text span populated by DocumentSpans.accept
|
||||||
verbatimMatchInBody = (largestOptional == coherences.testOptional(positions));
|
verbatimMatchInBody = (largestOptional == coherences.testOptional(positions));
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@ -225,29 +227,34 @@ public class IndexResultScoreCalculator {
|
|||||||
verbatimMatchInNav = false;
|
verbatimMatchInNav = false;
|
||||||
verbatimMatchInCode = false;
|
verbatimMatchInCode = false;
|
||||||
verbatimMatchInBody = false;
|
verbatimMatchInBody = false;
|
||||||
|
verbatimMatchInExtLink = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (verbatimMatchInTitle) {
|
if (verbatimMatchInTitle) {
|
||||||
// verbatim title match
|
// verbatim title match
|
||||||
coherenceScore = 4.0f * largestOptional;
|
verbatimMatchScore = 4.0f * largestOptional;
|
||||||
// additional bonus if the match is most of the title's length
|
// additional bonus if the match is most of the title's length
|
||||||
coherenceScore += 2.f * largestOptional / titleLength;
|
verbatimMatchScore += 2.f * largestOptional / titleLength;
|
||||||
}
|
}
|
||||||
else if (verbatimMatchInHeading) {
|
else if (verbatimMatchInHeading) {
|
||||||
coherenceScore = 1.5f * largestOptional;
|
verbatimMatchScore = 1.5f * largestOptional;
|
||||||
}
|
}
|
||||||
else if (verbatimMatchInAnchor || verbatimMatchInCode) {
|
else if (verbatimMatchInAnchor || verbatimMatchInCode) {
|
||||||
coherenceScore = 0.2f * largestOptional;
|
verbatimMatchScore = 0.2f * largestOptional;
|
||||||
}
|
}
|
||||||
else if (verbatimMatchInNav) {
|
else if (verbatimMatchInNav) {
|
||||||
coherenceScore = 0.1f * largestOptional;
|
verbatimMatchScore = 0.1f * largestOptional;
|
||||||
}
|
}
|
||||||
else if (verbatimMatchInBody) {
|
else if (verbatimMatchInBody) {
|
||||||
coherenceScore = 0.75f * largestOptional;
|
verbatimMatchScore = 0.75f * largestOptional;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (coherences.numOptional() > 0) {
|
if (coherences.numOptional() > 0) {
|
||||||
coherenceScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2);
|
verbatimMatchScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (verbatimMatchInExtLink) { // Nudge the result up if there is a verbatim match in the external link text
|
||||||
|
verbatimMatchScore += 1.0f * largestOptional;
|
||||||
}
|
}
|
||||||
|
|
||||||
float[] weightedCounts = new float[compiledQuery.size()];
|
float[] weightedCounts = new float[compiledQuery.size()];
|
||||||
@ -318,12 +325,12 @@ public class IndexResultScoreCalculator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!verbatimMatchInTitle && searchableKeywordsCount > 2 && unorderedMatchInTitleCount == searchableKeywordsCount) {
|
if (!verbatimMatchInTitle && searchableKeywordsCount > 2 && unorderedMatchInTitleCount == searchableKeywordsCount) {
|
||||||
coherenceScore += 2.5f * unorderedMatchInTitleCount;
|
verbatimMatchScore += 2.5f * unorderedMatchInTitleCount;
|
||||||
coherenceScore += 2.f * unorderedMatchInTitleCount / titleLength;
|
verbatimMatchScore += 2.f * unorderedMatchInTitleCount / titleLength;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!verbatimMatchInHeading && unorderedMatchInHeadingCount == searchableKeywordsCount) {
|
if (!verbatimMatchInHeading && unorderedMatchInHeadingCount == searchableKeywordsCount) {
|
||||||
coherenceScore += 2.0f * unorderedMatchInHeadingCount;
|
verbatimMatchScore += 2.0f * unorderedMatchInHeadingCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
double overallPart = averageSentenceLengthPenalty
|
double overallPart = averageSentenceLengthPenalty
|
||||||
@ -333,7 +340,7 @@ public class IndexResultScoreCalculator {
|
|||||||
+ topologyBonus
|
+ topologyBonus
|
||||||
+ temporalBias
|
+ temporalBias
|
||||||
+ flagsPenalty
|
+ flagsPenalty
|
||||||
+ coherenceScore
|
+ verbatimMatchScore
|
||||||
+ keywordMinDistFac;
|
+ keywordMinDistFac;
|
||||||
|
|
||||||
double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));
|
double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));
|
||||||
|
Loading…
Reference in New Issue
Block a user