(converter/index) Improve atag sentence matching by taking into consideration how many times a sentence appears in the links

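This change breaks the format of the atags.parquet file: the reader now expects an integer `cnt` column (how many times each link text was seen) in place of the old `source` column, so files written in the previous format are not compatible with this version.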
This commit is contained in:
Viktor Lofgren 2024-12-08 00:27:11 +01:00
parent ee2d5496d0
commit 291ca8daf1
9 changed files with 111 additions and 56 deletions

View File

@ -123,13 +123,13 @@ public class DocumentSpan {
/** Returns true if for any position in the list, there exists a range /** Returns true if for any position in the list, there exists a range
* (position[i], position[i]+len] that is overlapped by a span */ * (position[i], position[i]+len] that is overlapped by a span */
public boolean containsRangeExact(IntList positions, int len) { public int containsRangeExact(IntList positions, int len) {
if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) { if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) {
return false; return 0;
} }
int sei = 0; int sei = 0;
int cnt = 0;
int start = startsEnds.getInt(sei++); int start = startsEnds.getInt(sei++);
int end = startsEnds.getInt(sei++); int end = startsEnds.getInt(sei++);
@ -138,7 +138,15 @@ public class DocumentSpan {
int position = positions.getInt(pi); int position = positions.getInt(pi);
if (position == start && position + len == end) { if (position == start && position + len == end) {
return true; cnt++;
if (sei + 2 <= startsEnds.size()) {
pi = 0;
start = startsEnds.getInt(sei++);
end = startsEnds.getInt(sei++);
}
else {
break;
}
} }
else if (position < end) { else if (position < end) {
pi++; pi++;
@ -147,11 +155,11 @@ public class DocumentSpan {
end = startsEnds.getInt(sei++); end = startsEnds.getInt(sei++);
} }
else { else {
return false; break;
} }
} }
return false; return cnt;
} }
public int countRangeMatches(IntList positions, int len) { public int countRangeMatches(IntList positions, int len) {

View File

@ -115,16 +115,16 @@ class ForwardIndexSpansReaderTest {
) { ) {
var spans1 = reader.readSpans(arena, offset1); var spans1 = reader.readSpans(arena, offset1);
assertFalse(spans1.heading.containsRangeExact(IntList.of(10), 2)); assertEquals(0, spans1.heading.containsRangeExact(IntList.of(10), 2));
assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10), 2)); assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10), 2));
assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2)); assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2));
assertTrue(spans1.heading.containsRangeExact(IntList.of(10), 5)); assertEquals(1, spans1.heading.containsRangeExact(IntList.of(10), 5));
assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10), 5)); assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10), 5));
assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5)); assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5));
assertFalse(spans1.heading.containsRangeExact(IntList.of(11), 5)); assertEquals(0, spans1.heading.containsRangeExact(IntList.of(11), 5));
assertFalse(spans1.heading.containsRangeExact(IntList.of(9), 5)); assertEquals(0, spans1.heading.containsRangeExact(IntList.of(9), 5));
} }
} }

View File

@ -388,11 +388,13 @@ public class IndexResultScoreCalculator {
} }
var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT); var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
if (extLinkSpan.length() == fullGroup.size if (extLinkSpan.length() >= fullGroup.size) {
&& extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size)) int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
{ if (cnt > 0) {
score += 2; // Add additional bonus if there's a single-word atag span score += 2 * cnt;
}
} }
return; return;
} }
@ -407,9 +409,9 @@ public class IndexResultScoreCalculator {
// Bonus if there's a perfect match with an atag span // Bonus if there's a perfect match with an atag span
var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT); var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
if (extLinkSpan.length() == fullGroup.size && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size)) if (extLinkSpan.length() >= fullGroup.size) {
{ int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
score += 2; score += 2*cnt;
} }
// For optional groups, we scale the score by the size of the group relative to the full group // For optional groups, we scale the score by the size of the group relative to the full group
@ -420,7 +422,7 @@ public class IndexResultScoreCalculator {
IntList intersections = optionalGroup.findIntersections(positions); IntList intersections = optionalGroup.findIntersections(positions);
for (var tag : HtmlTag.includedTags) { for (var tag : HtmlTag.includedTags) {
int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);; int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);
if (cnts > 0) { if (cnts > 0) {
score += (float) (weights_partial[tag.ordinal()] * optionalGroup.size * sizeScalingFactor * (1 + Math.log(2 + cnts))); score += (float) (weights_partial[tag.ordinal()] * optionalGroup.size * sizeScalingFactor * (1 + Math.log(2 + cnts)));
} }
@ -457,7 +459,7 @@ public class IndexResultScoreCalculator {
case NAV -> 0.1f; case NAV -> 0.1f;
case CODE -> 0.25f; case CODE -> 0.25f;
case BODY -> 1.0f; case BODY -> 1.0f;
case EXTERNAL_LINKTEXT -> 0.75f; case EXTERNAL_LINKTEXT -> 1.5f;
default -> 0.0f; default -> 0.0f;
}; };
} }

View File

@ -1,6 +1,8 @@
package nu.marginalia.atags; package nu.marginalia.atags;
import com.google.inject.Inject; import com.google.inject.Inject;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.model.Link; import nu.marginalia.atags.model.Link;
import nu.marginalia.keyword.LinkTexts; import nu.marginalia.keyword.LinkTexts;
@ -51,6 +53,7 @@ public class AnchorTextKeywords {
List<Link> keywordsRaw = links.forUrl(url); List<Link> keywordsRaw = links.forUrl(url);
List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size()); List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
TIntList counts = new TIntArrayList(keywordsRaw.size());
// Extract and count keywords from anchor text // Extract and count keywords from anchor text
for (Link keyword : keywordsRaw) { for (Link keyword : keywordsRaw) {
@ -59,18 +62,20 @@ public class AnchorTextKeywords {
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT)); var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
ret.add(sentence); ret.add(sentence);
counts.add(keyword.count());
} }
return new LinkTexts(ret); return new LinkTexts(ret, counts);
} }
public LinkTexts getAnchorTextKeywords(DomainLinks links, List<EdgeUrl> urls) { public LinkTexts getAnchorTextKeywords(DomainLinks links, List<EdgeUrl> urls) {
List<Link> keywordsRaw = new ArrayList<>(); List<Link> keywordsRaw = new ArrayList<>();
for (var url : urls) { for (var url : urls) {
links.forUrl(url); keywordsRaw.addAll(links.forUrl(url));
} }
List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size()); List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
TIntList counts = new TIntArrayList(keywordsRaw.size());
// Extract and count keywords from anchor text // Extract and count keywords from anchor text
for (Link keyword : keywordsRaw) { for (Link keyword : keywordsRaw) {
@ -79,8 +84,9 @@ public class AnchorTextKeywords {
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT)); var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
ret.add(sentence); ret.add(sentence);
counts.add(keyword.count());
} }
return new LinkTexts(ret); return new LinkTexts(ret, counts);
} }
} }

View File

@ -1,4 +1,4 @@
package nu.marginalia.atags.model; package nu.marginalia.atags.model;
public record Link(String source, String text) { public record Link(String text, int count) {
} }

View File

@ -1,7 +1,7 @@
package nu.marginalia.atags.model; package nu.marginalia.atags.model;
public record LinkWithText(String url, String text, String source) { public record LinkWithText(String url, String text, int cnt) {
public Link toLink() { public Link toLink() {
return new Link(source, text); return new Link(text, cnt);
} }
} }

View File

@ -80,7 +80,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
select select
unnest(text) as 'text', unnest(text) as 'text',
unnest(url) as 'url', unnest(url) as 'url',
unnest(source) as 'source' unnest(cnt) as 'cnt'
from atags from atags
where dest = ? where dest = ?
""")) """))
@ -89,7 +89,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
ps.setString(1, domain.toString()); ps.setString(1, domain.toString());
var rs = ps.executeQuery(); var rs = ps.executeQuery();
while (rs.next()) { while (rs.next()) {
links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source"))); links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getInt("cnt")));
} }
// Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu? // Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu?
@ -102,7 +102,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
String url = rs.getString("url"); String url = rs.getString("url");
url = aliasDomain + url.substring(url.indexOf('/')); url = aliasDomain + url.substring(url.indexOf('/'));
links.add(new LinkWithText(url, rs.getString("text"), rs.getString("source"))); links.add(new LinkWithText(url, rs.getString("text"), rs.getInt("cnt")));
} }
return new DomainLinks(links); return new DomainLinks(links);
} }

View File

@ -1,6 +1,7 @@
package nu.marginalia.keyword; package nu.marginalia.keyword;
import com.google.inject.Inject; import com.google.inject.Inject;
import gnu.trove.list.TIntList;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.extractors.*; import nu.marginalia.keyword.extractors.*;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
@ -17,6 +18,9 @@ import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.stream.Stream; import java.util.stream.Stream;
import static java.lang.Math.min;
import static java.lang.Math.sqrt;
public class DocumentKeywordExtractor { public class DocumentKeywordExtractor {
private final KeywordExtractor keywordExtractor; private final KeywordExtractor keywordExtractor;
@ -162,40 +166,60 @@ public class DocumentKeywordExtractor {
recorder.reset(); recorder.reset();
} }
// ---
// Next add synthetic positions to the document for anchor texts // Next add synthetic positions to the document for anchor texts
pos += 2; // add some padding to the end of the document before we start adding a-tag words pos += 2; // add some padding to the end of the document before we start adding a-tag words
for (var linkText : linkTexts) {
for (var word : linkText) { // Add
pos++;
List<DocumentSentence> sentences = linkTexts.linkTexts();
TIntList counts = linkTexts.counts();
SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT);
for (int i = 0; i < linkTexts.length(); i++) {
DocumentSentence sentence = sentences.get(i);
// We repeat a link sentence a number of times that is a function of how many times it's been spotted
// as a link text. A really "big" link typically has hundreds, if not thousands of repetitions, so we
// attenuate that a bit with math so we don't generate a needlessly large positions list
final int repetitions = (int) min(sqrt(counts.get(i)), 12);
for (int ci = 0; ci < repetitions; ci++) {
for (var word : sentence) {
pos++;
extLinkRecorder.update(sentence, pos);
if (word.isStopWord()) {
continue;
}
String w = word.wordLowerCase();
if (matchesWordPattern(w)) {
/* Add information about term positions */
wordsBuilder.addPos(w, pos);
/* Add metadata for word */
wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
}
for (var recorder : spanRecorders) {
recorder.update(linkText, pos);
} }
if (word.isStopWord()) { // Add a break between sentences, to prevent them being registered as one long run-on sentence
continue; extLinkRecorder.stop(pos + 1);
}
String w = word.wordLowerCase(); // Also add some positional padding between separate link texts so we don't match across their boundaries
if (matchesWordPattern(w)) { pos += 2;
/* Add information about term positions */
wordsBuilder.addPos(w, pos);
/* Add metadata for word */
wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
}
} }
// add some padding between separate link texts so we don't match across their boundaries
pos+=2;
} }
for (var recorder : spanRecorders) { wordsBuilder.addSpans(extLinkRecorder.finish(pos));
wordsBuilder.addSpans(recorder.finish(pos));
}
} }
boolean matchesWordPattern(String s) { boolean matchesWordPattern(String s) {
@ -265,6 +289,12 @@ public class DocumentKeywordExtractor {
} }
} }
public void stop(int pos) {
if (start > 0) {
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
start = 0;
}
}
public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) { public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
if (start > 0) { if (start > 0) {
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length)); spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));

View File

@ -1,14 +1,23 @@
package nu.marginalia.keyword; package nu.marginalia.keyword;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.DocumentSentence;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
public record LinkTexts(List<DocumentSentence> linkTexts) implements Iterable<DocumentSentence> { public record LinkTexts(
List<DocumentSentence> linkTexts,
TIntList counts
) implements Iterable<DocumentSentence> {
public LinkTexts() { public LinkTexts() {
this(List.of()); this(List.of(), new TIntArrayList());
}
public int length() {
return linkTexts.size();
} }
@NotNull @NotNull