mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(coded-sequence) Handle weird legacy HTML that puts everything in a heading
This commit is contained in:
parent
7d471ec30d
commit
67a98fb0b0
@ -251,7 +251,9 @@ public class IndexResultScoreCalculator {
|
||||
int firstPosition = 1;
|
||||
for (int i = 0; i < weightedCounts.length; i++) {
|
||||
|
||||
if (positions[i] != null && ctx.regularMask.get(i)) {
|
||||
if (positions[i] == null || !ctx.regularMask.get(i))
|
||||
continue;
|
||||
|
||||
searchableKeywordsCount ++;
|
||||
int[] posArray = positions[i].toIntArray();
|
||||
|
||||
@ -266,7 +268,10 @@ public class IndexResultScoreCalculator {
|
||||
weightedCounts[i] += 2.5f * cnt;
|
||||
}
|
||||
if ((cnt = spans.heading.countIntersections(posArray)) != 0) {
|
||||
if (spans.heading.size() < 64) {
|
||||
// Correct for the case where there are a lot of headings everywhere, or the entire document is a heading
|
||||
unorderedMatchInHeadingCount++;
|
||||
}
|
||||
weightedCounts[i] += 2.5f * cnt;
|
||||
}
|
||||
if ((cnt = spans.code.countIntersections(posArray)) != 0) {
|
||||
@ -282,7 +287,6 @@ public class IndexResultScoreCalculator {
|
||||
weightedCounts[i] += 1.0f * cnt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!verbatimMatches.get(HtmlTag.TITLE) && searchableKeywordsCount > 2 && unorderedMatchInTitleCount == searchableKeywordsCount) {
|
||||
verbatimMatchScore += 2.5f * unorderedMatchInTitleCount;
|
||||
@ -290,7 +294,7 @@ public class IndexResultScoreCalculator {
|
||||
}
|
||||
|
||||
if (!verbatimMatches.get(HtmlTag.HEADING) && unorderedMatchInHeadingCount == searchableKeywordsCount) {
|
||||
verbatimMatchScore += 2.0f * unorderedMatchInHeadingCount;
|
||||
verbatimMatchScore += 1.0f * unorderedMatchInHeadingCount;
|
||||
}
|
||||
|
||||
double overallPart = averageSentenceLengthPenalty
|
||||
|
Loading…
Reference in New Issue
Block a user