mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(index) Simplify verbatim match calculation
This commit is contained in:
parent
4264fb9f49
commit
41b52f5bcd
@ -24,6 +24,7 @@ dependencies {
|
|||||||
implementation project(':code:libraries:btree')
|
implementation project(':code:libraries:btree')
|
||||||
implementation project(':code:libraries:slop')
|
implementation project(':code:libraries:slop')
|
||||||
implementation project(':code:libraries:coded-sequence')
|
implementation project(':code:libraries:coded-sequence')
|
||||||
|
implementation project(':code:libraries:language-processing')
|
||||||
|
|
||||||
implementation project(':code:common:db')
|
implementation project(':code:common:db')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
|
@ -15,6 +15,22 @@ public class DocumentSpans {
|
|||||||
|
|
||||||
public DocumentSpan externalLinkText = EMPTY_SPAN;
|
public DocumentSpan externalLinkText = EMPTY_SPAN;
|
||||||
|
|
||||||
|
public DocumentSpan getSpan(HtmlTag tag) {
|
||||||
|
if (tag == HtmlTag.HEADING)
|
||||||
|
return heading;
|
||||||
|
else if (tag == HtmlTag.TITLE)
|
||||||
|
return title;
|
||||||
|
else if (tag == HtmlTag.NAV)
|
||||||
|
return nav;
|
||||||
|
else if (tag == HtmlTag.CODE)
|
||||||
|
return code;
|
||||||
|
else if (tag == HtmlTag.ANCHOR)
|
||||||
|
return anchor;
|
||||||
|
else if (tag == HtmlTag.EXTERNAL_LINKTEXT)
|
||||||
|
return externalLinkText;
|
||||||
|
return EMPTY_SPAN;
|
||||||
|
}
|
||||||
|
|
||||||
void accept(byte code, CodedSequence positions) {
|
void accept(byte code, CodedSequence positions) {
|
||||||
if (code == HtmlTag.HEADING.code)
|
if (code == HtmlTag.HEADING.code)
|
||||||
this.heading = new DocumentSpan(positions);
|
this.heading = new DocumentSpan(positions);
|
||||||
|
@ -15,6 +15,7 @@ import nu.marginalia.index.model.SearchParameters;
|
|||||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||||
import nu.marginalia.index.results.model.QuerySearchTerms;
|
import nu.marginalia.index.results.model.QuerySearchTerms;
|
||||||
import nu.marginalia.index.results.model.TermCoherenceGroupList;
|
import nu.marginalia.index.results.model.TermCoherenceGroupList;
|
||||||
|
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
@ -27,6 +28,7 @@ import nu.marginalia.sequence.SequenceOperations;
|
|||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.lang.foreign.Arena;
|
import java.lang.foreign.Arena;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.BitSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
|
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
|
||||||
@ -137,6 +139,8 @@ public class IndexResultScoreCalculator {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public double calculateSearchResultValue(DebugRankingFactors rankingFactors,
|
public double calculateSearchResultValue(DebugRankingFactors rankingFactors,
|
||||||
QuerySearchTerms searchTerms,
|
QuerySearchTerms searchTerms,
|
||||||
CompiledQueryLong wordFlagsQuery,
|
CompiledQueryLong wordFlagsQuery,
|
||||||
@ -181,67 +185,13 @@ public class IndexResultScoreCalculator {
|
|||||||
|
|
||||||
final int titleLength = Math.max(1, spans.title.length());
|
final int titleLength = Math.max(1, spans.title.length());
|
||||||
|
|
||||||
float verbatimMatchScore = 0.f;
|
VerbatimMatches verbatimMatches = new VerbatimMatches();
|
||||||
|
|
||||||
boolean verbatimMatchInTitle;
|
|
||||||
boolean verbatimMatchInHeading;
|
|
||||||
boolean verbatimMatchInAnchor;
|
|
||||||
boolean verbatimMatchInNav;
|
|
||||||
boolean verbatimMatchInCode;
|
|
||||||
boolean verbatimMatchInBody;
|
|
||||||
boolean verbatimMatchInExtLink;
|
|
||||||
|
|
||||||
// Calculate a bonus for keyword coherences when large ones exist
|
|
||||||
int largestOptional = coherences.largestOptional();
|
|
||||||
if (largestOptional >= 2) {
|
|
||||||
verbatimMatchInTitle = (largestOptional == coherences.testOptional(positions, spans.title));
|
|
||||||
verbatimMatchInHeading = (largestOptional == coherences.testOptional(positions, spans.heading));
|
|
||||||
verbatimMatchInAnchor = (largestOptional == coherences.testOptional(positions, spans.anchor));
|
|
||||||
verbatimMatchInNav = (largestOptional == coherences.testOptional(positions, spans.nav));
|
|
||||||
verbatimMatchInCode = (largestOptional == coherences.testOptional(positions, spans.code));
|
|
||||||
verbatimMatchInExtLink = (largestOptional == coherences.testOptional(positions, spans.code));
|
|
||||||
verbatimMatchInBody = (largestOptional == coherences.testOptional(positions));
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
verbatimMatchInTitle = false;
|
|
||||||
verbatimMatchInHeading = false;
|
|
||||||
verbatimMatchInAnchor = false;
|
|
||||||
verbatimMatchInNav = false;
|
|
||||||
verbatimMatchInCode = false;
|
|
||||||
verbatimMatchInBody = false;
|
|
||||||
verbatimMatchInExtLink = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (verbatimMatchInTitle) {
|
float verbatimMatchScore = findVerbatimMatches(verbatimMatches, coherences, positions, spans);
|
||||||
// verbatim title match
|
|
||||||
verbatimMatchScore = 4.0f * largestOptional;
|
|
||||||
// additional bonus if the match is most of the title's length
|
|
||||||
verbatimMatchScore += 2.f * largestOptional / titleLength;
|
|
||||||
}
|
|
||||||
else if (verbatimMatchInHeading) {
|
|
||||||
verbatimMatchScore = 1.5f * largestOptional;
|
|
||||||
}
|
|
||||||
else if (verbatimMatchInAnchor || verbatimMatchInCode) {
|
|
||||||
verbatimMatchScore = 0.2f * largestOptional;
|
|
||||||
}
|
|
||||||
else if (verbatimMatchInNav) {
|
|
||||||
verbatimMatchScore = 0.1f * largestOptional;
|
|
||||||
}
|
|
||||||
else if (verbatimMatchInBody) {
|
|
||||||
verbatimMatchScore = 0.75f * largestOptional;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (coherences.numOptional() > 0) {
|
|
||||||
verbatimMatchScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (verbatimMatchInExtLink) { // Nudge the result up if there is a verbatim match in the external link text
|
|
||||||
verbatimMatchScore += 1.0f * largestOptional;
|
|
||||||
}
|
|
||||||
|
|
||||||
float[] weightedCounts = new float[compiledQuery.size()];
|
float[] weightedCounts = new float[compiledQuery.size()];
|
||||||
int firstPosition = Integer.MAX_VALUE;
|
|
||||||
|
|
||||||
float keywordMinDistFac = 0;
|
float keywordMinDistFac = 0;
|
||||||
if (positions.length > 2) {
|
if (positions.length > 2) {
|
||||||
List<IntIterator> iterators = new ArrayList<>(positions.length);
|
List<IntIterator> iterators = new ArrayList<>(positions.length);
|
||||||
@ -268,6 +218,7 @@ public class IndexResultScoreCalculator {
|
|||||||
int unorderedMatchInTitleCount = 0;
|
int unorderedMatchInTitleCount = 0;
|
||||||
int unorderedMatchInHeadingCount = 0;
|
int unorderedMatchInHeadingCount = 0;
|
||||||
|
|
||||||
|
int firstPosition = 0;
|
||||||
for (int i = 0; i < weightedCounts.length; i++) {
|
for (int i = 0; i < weightedCounts.length; i++) {
|
||||||
if (positions[i] != null && ctx.regularMask.get(i)) {
|
if (positions[i] != null && ctx.regularMask.get(i)) {
|
||||||
searchableKeywordsCount ++;
|
searchableKeywordsCount ++;
|
||||||
@ -312,12 +263,12 @@ public class IndexResultScoreCalculator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!verbatimMatchInTitle && searchableKeywordsCount > 2 && unorderedMatchInTitleCount == searchableKeywordsCount) {
|
if (!verbatimMatches.get(HtmlTag.TITLE) && searchableKeywordsCount > 2 && unorderedMatchInTitleCount == searchableKeywordsCount) {
|
||||||
verbatimMatchScore += 2.5f * unorderedMatchInTitleCount;
|
verbatimMatchScore += 2.5f * unorderedMatchInTitleCount;
|
||||||
verbatimMatchScore += 2.f * unorderedMatchInTitleCount / titleLength;
|
verbatimMatchScore += 2.f * unorderedMatchInTitleCount / titleLength;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!verbatimMatchInHeading && unorderedMatchInHeadingCount == searchableKeywordsCount) {
|
if (!verbatimMatches.get(HtmlTag.HEADING) && unorderedMatchInHeadingCount == searchableKeywordsCount) {
|
||||||
verbatimMatchScore += 2.0f * unorderedMatchInHeadingCount;
|
verbatimMatchScore += 2.0f * unorderedMatchInHeadingCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -373,26 +324,10 @@ public class IndexResultScoreCalculator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (verbatimMatchInAnchor) {
|
for (HtmlTag tag : HtmlTag.includedTags) {
|
||||||
rankingFactors.addTermFactor(termId, "verbatim.anchor", "true");
|
if (verbatimMatches.get(tag)) {
|
||||||
}
|
rankingFactors.addTermFactor(termId, "verbatim." + tag.name().toLowerCase(), "true");
|
||||||
if (verbatimMatchInBody) {
|
}
|
||||||
rankingFactors.addTermFactor(termId, "verbatim.body", "true");
|
|
||||||
}
|
|
||||||
if (verbatimMatchInCode) {
|
|
||||||
rankingFactors.addTermFactor(termId, "verbatim.code", "true");
|
|
||||||
}
|
|
||||||
if (verbatimMatchInExtLink) {
|
|
||||||
rankingFactors.addTermFactor(termId, "verbatim.extLink", "true");
|
|
||||||
}
|
|
||||||
if (verbatimMatchInHeading) {
|
|
||||||
rankingFactors.addTermFactor(termId, "verbatim.heading", "true");
|
|
||||||
}
|
|
||||||
if (verbatimMatchInNav) {
|
|
||||||
rankingFactors.addTermFactor(termId, "verbatim.nav", "true");
|
|
||||||
}
|
|
||||||
if (verbatimMatchInTitle) {
|
|
||||||
rankingFactors.addTermFactor(termId, "verbatim.title", "true");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (positions[i] != null) {
|
if (positions[i] != null) {
|
||||||
@ -430,6 +365,82 @@ public class IndexResultScoreCalculator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private float findVerbatimMatches(VerbatimMatches verbatimMatches,
|
||||||
|
TermCoherenceGroupList coherences,
|
||||||
|
CodedSequence[] positions,
|
||||||
|
DocumentSpans spans) {
|
||||||
|
|
||||||
|
// Calculate a bonus for keyword coherences when large ones exist
|
||||||
|
int largestOptional = coherences.largestOptional();
|
||||||
|
if (largestOptional < 2) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
float verbatimMatchScore = 0.f;
|
||||||
|
|
||||||
|
for (var optionalGroup : coherences.getOptionalGroups()) {
|
||||||
|
int groupSize = optionalGroup.size;
|
||||||
|
float sizeScalingFactor = groupSize / (float) largestOptional;
|
||||||
|
|
||||||
|
for (var tag : HtmlTag.includedTags) {
|
||||||
|
if (optionalGroup.test(spans.getSpan(tag), positions)) {
|
||||||
|
verbatimMatchScore += verbatimMatches.getWeight(tag) * sizeScalingFactor * groupSize;
|
||||||
|
|
||||||
|
if (optionalGroup.size == largestOptional) {
|
||||||
|
verbatimMatches.set(tag);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (coherences.numOptional() > 0) {
|
||||||
|
verbatimMatchScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
return verbatimMatchScore;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class VerbatimMatches {
|
||||||
|
private final BitSet matches;
|
||||||
|
private final float[] weights;
|
||||||
|
|
||||||
|
public VerbatimMatches() {
|
||||||
|
matches = new BitSet(HtmlTag.includedTags.length);
|
||||||
|
weights = new float[] { HtmlTag.includedTags.length };
|
||||||
|
|
||||||
|
for (int i = 0; i < weights.length; i++) {
|
||||||
|
weights[i] = switch(HtmlTag.includedTags[i]) {
|
||||||
|
case TITLE -> 4.0f;
|
||||||
|
case HEADING -> 1.5f;
|
||||||
|
case ANCHOR -> 0.2f;
|
||||||
|
case NAV -> 0.1f;
|
||||||
|
case CODE -> 0.25f;
|
||||||
|
case EXTERNAL_LINKTEXT -> 1.0f;
|
||||||
|
default -> 0.0f;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean get(HtmlTag tag) {
|
||||||
|
assert !tag.exclude;
|
||||||
|
return matches.get(tag.ordinal());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void set(HtmlTag tag) {
|
||||||
|
assert !tag.exclude;
|
||||||
|
matches.set(tag.ordinal());
|
||||||
|
}
|
||||||
|
|
||||||
|
public float getWeight(HtmlTag tag) {
|
||||||
|
assert !tag.exclude;
|
||||||
|
return weights[tag.ordinal()];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
|
private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
|
||||||
if (size < 400) {
|
if (size < 400) {
|
||||||
if (quality < 5)
|
if (quality < 5)
|
||||||
|
@ -10,6 +10,7 @@ import nu.marginalia.sequence.SequenceOperations;
|
|||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -29,6 +30,10 @@ public class TermCoherenceGroupList {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public List<TermCoherenceGroup> getOptionalGroups() {
|
||||||
|
return Collections.unmodifiableList(optionalGroups);
|
||||||
|
}
|
||||||
|
|
||||||
public boolean testMandatory(CodedSequence[] positions) {
|
public boolean testMandatory(CodedSequence[] positions) {
|
||||||
|
|
||||||
for (var coherenceSet : mandatoryGroups) {
|
for (var coherenceSet : mandatoryGroups) {
|
||||||
|
@ -1,24 +1,27 @@
|
|||||||
package nu.marginalia.language.sentence.tag;
|
package nu.marginalia.language.sentence.tag;
|
||||||
|
|
||||||
public enum HtmlTag {
|
import java.util.Arrays;
|
||||||
FORM((byte) 0, true, false),
|
|
||||||
SCRIPT((byte) 0, true, false),
|
|
||||||
STYLE((byte) 0, true, false),
|
|
||||||
|
|
||||||
|
public enum HtmlTag {
|
||||||
ANCHOR((byte) 'a', false, false),
|
ANCHOR((byte) 'a', false, false),
|
||||||
TITLE((byte) 't', false, false),
|
TITLE((byte) 't', false, false),
|
||||||
HEADING((byte) 'h', false, false),
|
HEADING((byte) 'h', false, false),
|
||||||
CODE((byte) 'c', false, true),
|
CODE((byte) 'c', false, true),
|
||||||
NAV((byte) 'n', false, false),
|
NAV((byte) 'n', false, false),
|
||||||
|
|
||||||
// pseudo-tags for internal use
|
// pseudo-tags for internal use
|
||||||
|
BODY((byte) 'b', false, false),
|
||||||
EXTERNAL_LINKTEXT((byte) 'x', false, false),
|
EXTERNAL_LINKTEXT((byte) 'x', false, false),
|
||||||
|
|
||||||
|
// excluded tags must be put last!
|
||||||
|
FORM((byte) 0, true, false),
|
||||||
|
SCRIPT((byte) 0, true, false),
|
||||||
|
STYLE((byte) 0, true, false),
|
||||||
;
|
;
|
||||||
|
|
||||||
public byte code;
|
public final byte code;
|
||||||
public boolean exclude;
|
public final boolean exclude;
|
||||||
public boolean nonLanguage;
|
public final boolean nonLanguage;
|
||||||
|
|
||||||
HtmlTag(byte code, boolean exclude, boolean nonLanguage) {
|
HtmlTag(byte code, boolean exclude, boolean nonLanguage) {
|
||||||
this.code = code;
|
this.code = code;
|
||||||
@ -26,4 +29,21 @@ public enum HtmlTag {
|
|||||||
this.nonLanguage = nonLanguage;
|
this.nonLanguage = nonLanguage;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * The non-excluded tags, in the order they are declared in the enum.
 * <p>
 * Because excluded tags are required to be declared last, the index of a tag in this
 * array equals its {@code ordinal()}.
 */
public static final HtmlTag[] includedTags;

static {
    // values() clones its backing array on every call, so fetch it once
    HtmlTag[] values = values();

    // The included tags are the leading run of the declaration order
    int includedCount = 0;
    while (includedCount < values.length && !values[includedCount].exclude) {
        includedCount++;
    }

    // Anything after the first excluded tag must be excluded as well; otherwise
    // ordinals would no longer line up with indices into includedTags
    for (int i = includedCount; i < values.length; i++) {
        if (!values[i].exclude) {
            throw new IllegalStateException("Excluded tags must be put last");
        }
    }

    includedTags = Arrays.copyOf(values, includedCount);
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user