From 291ca8daf1e44df0fb3f19080352e2ba3496a546 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 8 Dec 2024 00:27:11 +0100 Subject: [PATCH] (converter/index) Improve atag sentence matching by taking into consideration how many times a sentence appears in the links This change breaks the format of the atags.parquet file. --- .../index/forward/spans/DocumentSpan.java | 20 +++-- .../forward/ForwardIndexSpansReaderTest.java | 16 ++-- .../results/IndexResultScoreCalculator.java | 20 ++--- .../marginalia/atags/AnchorTextKeywords.java | 12 ++- .../java/nu/marginalia/atags/model/Link.java | 2 +- .../marginalia/atags/model/LinkWithText.java | 4 +- .../atags/source/AnchorTagsImpl.java | 6 +- .../keyword/DocumentKeywordExtractor.java | 74 +++++++++++++------ .../java/nu/marginalia/keyword/LinkTexts.java | 13 +++- 9 files changed, 111 insertions(+), 56 deletions(-) diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java index 5ab5d166..bf077683 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java @@ -123,13 +123,13 @@ public class DocumentSpan { /** Returns true if for any position in the list, there exists a range * (position[i], position[i]+len] that is overlapped by a span */ - public boolean containsRangeExact(IntList positions, int len) { + public int containsRangeExact(IntList positions, int len) { if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) { - return false; + return 0; } int sei = 0; - + int cnt = 0; int start = startsEnds.getInt(sei++); int end = startsEnds.getInt(sei++); @@ -138,7 +138,15 @@ public class DocumentSpan { int position = positions.getInt(pi); if (position == start && position + len == end) { - return true; + cnt++; + if (sei + 2 <= startsEnds.size()) { + pi = 0; + start = startsEnds.getInt(sei++); + end = startsEnds.getInt(sei++); + } + else { + break; + } } else if (position < end) { pi++; @@ -147,11 +155,11 @@ public class DocumentSpan { end = startsEnds.getInt(sei++); } else { - return false; + break; } } - return false; + return cnt; } public int countRangeMatches(IntList positions, int len) { diff --git a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java index f0170883..a5085c25 100644 --- a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java +++ b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java @@ -115,16 +115,16 @@ class ForwardIndexSpansReaderTest { ) { var spans1 = reader.readSpans(arena, offset1); - assertFalse(spans1.heading.containsRangeExact(IntList.of(10), 2)); - assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10), 2)); - assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2)); + assertEquals(0, spans1.heading.containsRangeExact(IntList.of(10), 2)); + assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10), 2)); + assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2)); - assertTrue(spans1.heading.containsRangeExact(IntList.of(10), 5)); - assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10), 5)); - assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5)); + assertEquals(1, spans1.heading.containsRangeExact(IntList.of(10), 5)); + assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10), 5)); + assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5)); - assertFalse(spans1.heading.containsRangeExact(IntList.of(11), 5)); - assertFalse(spans1.heading.containsRangeExact(IntList.of(9), 5)); + assertEquals(0, spans1.heading.containsRangeExact(IntList.of(11), 5)); + assertEquals(0, spans1.heading.containsRangeExact(IntList.of(9), 5)); } } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java index 74ad0e60..10f4ea05 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java @@ -388,11 +388,13 @@ public class IndexResultScoreCalculator { } var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT); - if (extLinkSpan.length() == fullGroup.size - && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size)) - { - score += 2; // Add additional bonus if there's a single-word atag span + if (extLinkSpan.length() >= fullGroup.size) { + int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size); + if (cnt > 0) { + score += 2 * cnt; + } } + return; } @@ -407,9 +409,9 @@ public class IndexResultScoreCalculator { // Bonus if there's a perfect match with an atag span var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT); - if (extLinkSpan.length() == fullGroup.size && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size)) - { - score += 2; + if (extLinkSpan.length() >= fullGroup.size) { + int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size); + score += 2*cnt; } // For optional groups, we scale the score by the size of the group relative to the full group @@ -420,7 +422,7 @@ public class IndexResultScoreCalculator { IntList intersections = optionalGroup.findIntersections(positions); for (var tag : HtmlTag.includedTags) { - int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);; + int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size); if (cnts > 0) { score += (float) (weights_partial[tag.ordinal()] * optionalGroup.size * sizeScalingFactor * (1 + Math.log(2 + cnts))); } @@ -457,7 +459,7 @@ public class IndexResultScoreCalculator { case NAV -> 0.1f; case CODE -> 0.25f; case BODY -> 1.0f; - case EXTERNAL_LINKTEXT -> 0.75f; + case EXTERNAL_LINKTEXT -> 1.5f; default -> 0.0f; }; } diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java index 2e0b6bd7..2ee65d25 100644 --- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java +++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java @@ -1,6 +1,8 @@ package nu.marginalia.atags; import com.google.inject.Inject; +import gnu.trove.list.TIntList; +import gnu.trove.list.array.TIntArrayList; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.model.Link; import nu.marginalia.keyword.LinkTexts; @@ -51,6 +53,7 @@ public class AnchorTextKeywords { List keywordsRaw = links.forUrl(url); List ret = new ArrayList<>(keywordsRaw.size()); + TIntList counts = new TIntArrayList(keywordsRaw.size()); // Extract and count keywords from anchor text for (Link keyword : keywordsRaw) { @@ -59,18 +62,20 @@ public class AnchorTextKeywords { var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT)); ret.add(sentence); + counts.add(keyword.count()); } - return new LinkTexts(ret); + return new LinkTexts(ret, counts); } public LinkTexts getAnchorTextKeywords(DomainLinks links, List urls) { List keywordsRaw = new ArrayList<>(); for (var url : urls) { - links.forUrl(url); + keywordsRaw.addAll(links.forUrl(url)); } List ret = new ArrayList<>(keywordsRaw.size()); + TIntList counts = new TIntArrayList(keywordsRaw.size()); // Extract and count keywords from anchor text for (Link keyword : keywordsRaw) { @@ -79,8 +84,9 @@ public class AnchorTextKeywords { var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT)); ret.add(sentence); + counts.add(keyword.count()); } - return new LinkTexts(ret); + return new LinkTexts(ret, counts); } } diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java index 1c76469f..66d1e977 100644 --- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java +++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java @@ -1,4 +1,4 @@ package nu.marginalia.atags.model; -public record Link(String source, String text) { +public record Link(String text, int count) { } diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java index 784580fc..55986949 100644 --- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java +++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java @@ -1,7 +1,7 @@ package nu.marginalia.atags.model; -public record LinkWithText(String url, String text, String source) { +public record LinkWithText(String url, String text, int cnt) { public Link toLink() { - return new Link(source, text); + return new Link(text, cnt); } } diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java index c80a57c7..a15dfecd 100644 --- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java +++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java @@ -80,7 +80,7 @@ public class AnchorTagsImpl implements AnchorTagsSource { select unnest(text) as 'text', unnest(url) as 'url', - unnest(source) as 'source' + unnest(cnt) as 'cnt' from atags where dest = ? """)) @@ -89,7 +89,7 @@ public class AnchorTagsImpl implements AnchorTagsSource { ps.setString(1, domain.toString()); var rs = ps.executeQuery(); while (rs.next()) { - links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source"))); + links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getInt("cnt"))); } // Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu? @@ -102,7 +102,7 @@ public class AnchorTagsImpl implements AnchorTagsSource { String url = rs.getString("url"); url = aliasDomain + url.substring(url.indexOf('/')); - links.add(new LinkWithText(url, rs.getString("text"), rs.getString("source"))); + links.add(new LinkWithText(url, rs.getString("text"), rs.getInt("cnt"))); } return new DomainLinks(links); } diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java index d0db9b7c..7d9eae69 100644 --- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -1,6 +1,7 @@ package nu.marginalia.keyword; import com.google.inject.Inject; +import gnu.trove.list.TIntList; import nu.marginalia.WmsaHome; import nu.marginalia.keyword.extractors.*; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; @@ -17,6 +18,9 @@ import java.util.Comparator; import java.util.List; import java.util.stream.Stream; +import static java.lang.Math.min; +import static java.lang.Math.sqrt; + public class DocumentKeywordExtractor { private final KeywordExtractor keywordExtractor; @@ -162,40 +166,60 @@ public class DocumentKeywordExtractor { recorder.reset(); } + // --- + // Next add synthetic positions to the document for anchor texts pos += 2; // add some padding to the end of the document before we start adding a-tag words - for (var linkText : linkTexts) { - for (var word : linkText) { - pos++; + // Add + + List sentences = linkTexts.linkTexts(); + TIntList counts = linkTexts.counts(); + SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT); + + for (int i = 0; i < linkTexts.length(); i++) { + + DocumentSentence sentence = sentences.get(i); + + // We repeat a link sentence a number of times that is a function of how many times it's been spotted + // as a link text. A really "big" link typically has hundreds, if not thousands of repetitions, so we + // attenuate that a bit with math so we don't generate a needlessly large positions list + + final int repetitions = (int) min(sqrt(counts.get(i)), 12); + + for (int ci = 0; ci < repetitions; ci++) { + + for (var word : sentence) { + pos++; + + extLinkRecorder.update(sentence, pos); + + if (word.isStopWord()) { + continue; + } + + String w = word.wordLowerCase(); + if (matchesWordPattern(w)) { + /* Add information about term positions */ + wordsBuilder.addPos(w, pos); + + /* Add metadata for word */ + wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed())); + } - for (var recorder : spanRecorders) { - recorder.update(linkText, pos); } - if (word.isStopWord()) { - continue; - } + // Add a break between sentences, to prevent them being registered as one long run-on sentence + extLinkRecorder.stop(pos + 1); - String w = word.wordLowerCase(); - if (matchesWordPattern(w)) { - /* Add information about term positions */ - wordsBuilder.addPos(w, pos); - - /* Add metadata for word */ - wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed())); - } + // Also add some positional padding between separate link texts so we don't match across their boundaries + pos += 2; } - - // add some padding between separate link texts so we don't match across their boundaries - pos+=2; } - for (var recorder : spanRecorders) { - wordsBuilder.addSpans(recorder.finish(pos)); - } + wordsBuilder.addSpans(extLinkRecorder.finish(pos)); } boolean matchesWordPattern(String s) { @@ -265,6 +289,12 @@ public class DocumentKeywordExtractor { } } + public void stop(int pos) { + if (start > 0) { + spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos)); + start = 0; + } + } public List finish(int length) { if (start > 0) { spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length)); diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java index c1ade6b4..0251c168 100644 --- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java @@ -1,14 +1,23 @@ package nu.marginalia.keyword; +import gnu.trove.list.TIntList; +import gnu.trove.list.array.TIntArrayList; import nu.marginalia.language.model.DocumentSentence; import org.jetbrains.annotations.NotNull; import java.util.Iterator; import java.util.List; -public record LinkTexts(List linkTexts) implements Iterable { +public record LinkTexts( + List linkTexts, + TIntList counts +) implements Iterable { public LinkTexts() { - this(List.of()); + this(List.of(), new TIntArrayList()); + } + + public int length() { + return linkTexts.size(); } @NotNull