From ca6e2db2b9efb38ab46aeea7e77904345d170c3a Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Tue, 6 Aug 2024 10:23:23 +0200
Subject: [PATCH] (index) Include external link texts in verbatim score

---
 .../index/forward/spans/DocumentSpans.java    |  4 +++
 .../results/IndexResultScoreCalculator.java   | 31 ++++++++++++-------
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java
index 6eebbd63..8f8d5cf5 100644
--- a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java
+++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java
@@ -13,6 +13,8 @@ public class DocumentSpans {
     public DocumentSpan code = EMPTY_SPAN;
     public DocumentSpan anchor = EMPTY_SPAN;
 
+    public DocumentSpan externalLinkText = EMPTY_SPAN;
+
     void accept(byte code, CodedSequence positions) {
         if (code == HtmlTag.HEADING.code)
             this.heading = new DocumentSpan(positions);
@@ -24,6 +26,8 @@ public class DocumentSpans {
             this.code = new DocumentSpan(positions);
         else if (code == HtmlTag.ANCHOR.code)
             this.anchor = new DocumentSpan(positions);
+        else if (code == HtmlTag.EXTERNAL_LINKTEXT.code)
+            this.externalLinkText = new DocumentSpan(positions);
     }
 
 }
diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
index aa414c1e..b4349314 100644
--- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
+++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
@@ -199,7 +199,7 @@ public class IndexResultScoreCalculator {
 
         final int titleLength = Math.max(1, spans.title.length());
 
-        float coherenceScore = 0.f;
+        float verbatimMatchScore = 0.f;
 
         boolean verbatimMatchInTitle;
         boolean verbatimMatchInHeading;
@@ -207,6 +207,7 @@ public class IndexResultScoreCalculator {
         boolean verbatimMatchInNav;
         boolean verbatimMatchInCode;
         boolean verbatimMatchInBody;
+        boolean verbatimMatchInExtLink;
 
         // Calculate a bonus for keyword coherences when large ones exist
         int largestOptional = coherences.largestOptional();
@@ -216,6 +217,7 @@ public class IndexResultScoreCalculator {
             verbatimMatchInAnchor = (largestOptional == coherences.testOptional(positions, spans.anchor));
             verbatimMatchInNav = (largestOptional == coherences.testOptional(positions, spans.nav));
             verbatimMatchInCode = (largestOptional == coherences.testOptional(positions, spans.code));
+            verbatimMatchInExtLink = (largestOptional == coherences.testOptional(positions, spans.externalLinkText));
             verbatimMatchInBody = (largestOptional == coherences.testOptional(positions));
         }
         else {
@@ -225,29 +227,34 @@ public class IndexResultScoreCalculator {
             verbatimMatchInNav = false;
             verbatimMatchInCode = false;
             verbatimMatchInBody = false;
+            verbatimMatchInExtLink = false;
         }
 
         if (verbatimMatchInTitle) {
             // verbatim title match
-            coherenceScore = 4.0f * largestOptional;
+            verbatimMatchScore = 4.0f * largestOptional;
             // additional bonus if the match is most of the title's length
-            coherenceScore += 2.f * largestOptional / titleLength;
+            verbatimMatchScore += 2.f * largestOptional / titleLength;
         }
         else if (verbatimMatchInHeading) {
-            coherenceScore = 1.5f * largestOptional;
+            verbatimMatchScore = 1.5f * largestOptional;
         }
         else if (verbatimMatchInAnchor || verbatimMatchInCode) {
-            coherenceScore = 0.2f * largestOptional;
+            verbatimMatchScore = 0.2f * largestOptional;
         }
         else if (verbatimMatchInNav) {
-            coherenceScore = 0.1f * largestOptional;
+            verbatimMatchScore = 0.1f * largestOptional;
         }
         else if (verbatimMatchInBody) {
-            coherenceScore = 0.75f * largestOptional;
+            verbatimMatchScore = 0.75f * largestOptional;
         }
 
         if (coherences.numOptional() > 0) {
-            coherenceScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2);
+            verbatimMatchScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2);
+        }
+
+        if (verbatimMatchInExtLink) { // Nudge the result up if there is a verbatim match in the external link text
+            verbatimMatchScore += 1.0f * largestOptional;
         }
 
         float[] weightedCounts = new float[compiledQuery.size()];
@@ -318,12 +325,12 @@ public class IndexResultScoreCalculator {
         }
 
         if (!verbatimMatchInTitle && searchableKeywordsCount > 2 && unorderedMatchInTitleCount == searchableKeywordsCount) {
-            coherenceScore += 2.5f * unorderedMatchInTitleCount;
-            coherenceScore += 2.f * unorderedMatchInTitleCount / titleLength;
+            verbatimMatchScore += 2.5f * unorderedMatchInTitleCount;
+            verbatimMatchScore += 2.f * unorderedMatchInTitleCount / titleLength;
         }
 
         if (!verbatimMatchInHeading && unorderedMatchInHeadingCount == searchableKeywordsCount) {
-            coherenceScore += 2.0f * unorderedMatchInHeadingCount;
+            verbatimMatchScore += 2.0f * unorderedMatchInHeadingCount;
         }
 
         double overallPart = averageSentenceLengthPenalty
@@ -333,7 +340,7 @@ public class IndexResultScoreCalculator {
                 + topologyBonus
                 + temporalBias
                 + flagsPenalty
-                + coherenceScore
+                + verbatimMatchScore
                 + keywordMinDistFac;
 
         double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));