(coded-sequence) Handle weird legacy HTML that puts everything in a heading

This commit is contained in:
Viktor Lofgren 2024-08-26 12:49:15 +02:00
parent 7d471ec30d
commit 67a98fb0b0

View File

@@ -251,7 +251,9 @@ public class IndexResultScoreCalculator {
int firstPosition = 1;
for (int i = 0; i < weightedCounts.length; i++) {
if (positions[i] != null && ctx.regularMask.get(i)) {
if (positions[i] == null || !ctx.regularMask.get(i))
continue;
searchableKeywordsCount ++;
int[] posArray = positions[i].toIntArray();
@@ -266,7 +268,10 @@ public class IndexResultScoreCalculator {
weightedCounts[i] += 2.5f * cnt;
}
if ((cnt = spans.heading.countIntersections(posArray)) != 0) {
if (spans.heading.size() < 64) {
// Correct for the case where there's a lot of headings everywhere, or the entire document is a heading
unorderedMatchInHeadingCount++;
}
weightedCounts[i] += 2.5f * cnt;
}
if ((cnt = spans.code.countIntersections(posArray)) != 0) {
@@ -282,7 +287,6 @@ public class IndexResultScoreCalculator {
weightedCounts[i] += 1.0f * cnt;
}
}
}
if (!verbatimMatches.get(HtmlTag.TITLE) && searchableKeywordsCount > 2 && unorderedMatchInTitleCount == searchableKeywordsCount) {
verbatimMatchScore += 2.5f * unorderedMatchInTitleCount;
@@ -290,7 +294,7 @@ public class IndexResultScoreCalculator {
}
if (!verbatimMatches.get(HtmlTag.HEADING) && unorderedMatchInHeadingCount == searchableKeywordsCount) {
verbatimMatchScore += 2.0f * unorderedMatchInHeadingCount;
verbatimMatchScore += 1.0f * unorderedMatchInHeadingCount;
}
double overallPart = averageSentenceLengthPenalty