diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java
index 5ab5d166..bf077683 100644
--- a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java
+++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java
@@ -123,13 +123,13 @@ public class DocumentSpan {
/** Returns true if for any position in the list, there exists a range
* (position[i], position[i]+len] that is overlapped by a span */
- public boolean containsRangeExact(IntList positions, int len) {
+ public int containsRangeExact(IntList positions, int len) {
if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) {
- return false;
+ return 0;
}
int sei = 0;
-
+ int cnt = 0;
int start = startsEnds.getInt(sei++);
int end = startsEnds.getInt(sei++);
@@ -138,7 +138,15 @@ public class DocumentSpan {
int position = positions.getInt(pi);
if (position == start && position + len == end) {
- return true;
+ cnt++;
+ if (sei + 2 <= startsEnds.size()) {
+ pi = 0;
+ start = startsEnds.getInt(sei++);
+ end = startsEnds.getInt(sei++);
+ }
+ else {
+ break;
+ }
}
else if (position < end) {
pi++;
@@ -147,11 +155,11 @@ public class DocumentSpan {
end = startsEnds.getInt(sei++);
}
else {
- return false;
+ break;
}
}
- return false;
+ return cnt;
}
public int countRangeMatches(IntList positions, int len) {
diff --git a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java
index f0170883..a5085c25 100644
--- a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java
+++ b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java
@@ -115,16 +115,16 @@ class ForwardIndexSpansReaderTest {
) {
var spans1 = reader.readSpans(arena, offset1);
- assertFalse(spans1.heading.containsRangeExact(IntList.of(10), 2));
- assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10), 2));
- assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2));
+ assertEquals(0, spans1.heading.containsRangeExact(IntList.of(10), 2));
+ assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10), 2));
+ assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2));
- assertTrue(spans1.heading.containsRangeExact(IntList.of(10), 5));
- assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10), 5));
- assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5));
+ assertEquals(1, spans1.heading.containsRangeExact(IntList.of(10), 5));
+ assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10), 5));
+ assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5));
- assertFalse(spans1.heading.containsRangeExact(IntList.of(11), 5));
- assertFalse(spans1.heading.containsRangeExact(IntList.of(9), 5));
+ assertEquals(0, spans1.heading.containsRangeExact(IntList.of(11), 5));
+ assertEquals(0, spans1.heading.containsRangeExact(IntList.of(9), 5));
}
}
diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
index 74ad0e60..10f4ea05 100644
--- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
+++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
@@ -388,11 +388,13 @@ public class IndexResultScoreCalculator {
}
var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
- if (extLinkSpan.length() == fullGroup.size
- && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size))
- {
- score += 2; // Add additional bonus if there's a single-word atag span
+ if (extLinkSpan.length() >= fullGroup.size) {
+ int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
+ if (cnt > 0) {
+ score += 2 * cnt;
+ }
}
+
return;
}
@@ -407,9 +409,9 @@ public class IndexResultScoreCalculator {
// Bonus if there's a perfect match with an atag span
var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
- if (extLinkSpan.length() == fullGroup.size && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size))
- {
- score += 2;
+ if (extLinkSpan.length() >= fullGroup.size) {
+ int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
+ score += 2*cnt;
}
// For optional groups, we scale the score by the size of the group relative to the full group
@@ -420,7 +422,7 @@ public class IndexResultScoreCalculator {
IntList intersections = optionalGroup.findIntersections(positions);
for (var tag : HtmlTag.includedTags) {
- int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);;
+ int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);
if (cnts > 0) {
score += (float) (weights_partial[tag.ordinal()] * optionalGroup.size * sizeScalingFactor * (1 + Math.log(2 + cnts)));
}
@@ -457,7 +459,7 @@ public class IndexResultScoreCalculator {
case NAV -> 0.1f;
case CODE -> 0.25f;
case BODY -> 1.0f;
- case EXTERNAL_LINKTEXT -> 0.75f;
+ case EXTERNAL_LINKTEXT -> 1.5f;
default -> 0.0f;
};
}
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java
index 2e0b6bd7..2ee65d25 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java
@@ -1,6 +1,8 @@
package nu.marginalia.atags;
import com.google.inject.Inject;
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.model.Link;
import nu.marginalia.keyword.LinkTexts;
@@ -51,6 +53,7 @@ public class AnchorTextKeywords {
List keywordsRaw = links.forUrl(url);
List ret = new ArrayList<>(keywordsRaw.size());
+ TIntList counts = new TIntArrayList(keywordsRaw.size());
// Extract and count keywords from anchor text
for (Link keyword : keywordsRaw) {
@@ -59,18 +62,20 @@ public class AnchorTextKeywords {
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
ret.add(sentence);
+ counts.add(keyword.count());
}
- return new LinkTexts(ret);
+ return new LinkTexts(ret, counts);
}
public LinkTexts getAnchorTextKeywords(DomainLinks links, List urls) {
List keywordsRaw = new ArrayList<>();
for (var url : urls) {
- links.forUrl(url);
+ keywordsRaw.addAll(links.forUrl(url));
}
List ret = new ArrayList<>(keywordsRaw.size());
+ TIntList counts = new TIntArrayList(keywordsRaw.size());
// Extract and count keywords from anchor text
for (Link keyword : keywordsRaw) {
@@ -79,8 +84,9 @@ public class AnchorTextKeywords {
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
ret.add(sentence);
+ counts.add(keyword.count());
}
- return new LinkTexts(ret);
+ return new LinkTexts(ret, counts);
}
}
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java
index 1c76469f..66d1e977 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java
@@ -1,4 +1,4 @@
package nu.marginalia.atags.model;
-public record Link(String source, String text) {
+public record Link(String text, int count) {
}
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java
index 784580fc..55986949 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java
@@ -1,7 +1,7 @@
package nu.marginalia.atags.model;
-public record LinkWithText(String url, String text, String source) {
+public record LinkWithText(String url, String text, int cnt) {
public Link toLink() {
- return new Link(source, text);
+ return new Link(text, cnt);
}
}
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
index c80a57c7..a15dfecd 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
@@ -80,7 +80,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
select
unnest(text) as 'text',
unnest(url) as 'url',
- unnest(source) as 'source'
+ unnest(cnt) as 'cnt'
from atags
where dest = ?
"""))
@@ -89,7 +89,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
ps.setString(1, domain.toString());
var rs = ps.executeQuery();
while (rs.next()) {
- links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source")));
+ links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getInt("cnt")));
}
// Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu?
@@ -102,7 +102,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
String url = rs.getString("url");
url = aliasDomain + url.substring(url.indexOf('/'));
- links.add(new LinkWithText(url, rs.getString("text"), rs.getString("source")));
+ links.add(new LinkWithText(url, rs.getString("text"), rs.getInt("cnt")));
}
return new DomainLinks(links);
}
diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
index d0db9b7c..7d9eae69 100644
--- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
+++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
@@ -1,6 +1,7 @@
package nu.marginalia.keyword;
import com.google.inject.Inject;
+import gnu.trove.list.TIntList;
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
@@ -17,6 +18,9 @@ import java.util.Comparator;
import java.util.List;
import java.util.stream.Stream;
+import static java.lang.Math.min;
+import static java.lang.Math.sqrt;
+
public class DocumentKeywordExtractor {
private final KeywordExtractor keywordExtractor;
@@ -162,40 +166,60 @@ public class DocumentKeywordExtractor {
recorder.reset();
}
+ // ---
+
// Next add synthetic positions to the document for anchor texts
pos += 2; // add some padding to the end of the document before we start adding a-tag words
- for (var linkText : linkTexts) {
- for (var word : linkText) {
- pos++;
+ // Add each link text to the document, repeated according to how often it was seen as an anchor text
+
+ List sentences = linkTexts.linkTexts();
+ TIntList counts = linkTexts.counts();
+ SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT);
+
+ for (int i = 0; i < linkTexts.length(); i++) {
+
+ DocumentSentence sentence = sentences.get(i);
+
+ // We repeat a link sentence a number of times that is a function of how many times it's been spotted
+ // as a link text. A really "big" link typically has hundreds, if not thousands of repetitions, so we
+ // attenuate that a bit with math so we don't generate a needlessly large positions list
+
+ final int repetitions = (int) min(sqrt(counts.get(i)), 12);
+
+ for (int ci = 0; ci < repetitions; ci++) {
+
+ for (var word : sentence) {
+ pos++;
+
+ extLinkRecorder.update(sentence, pos);
+
+ if (word.isStopWord()) {
+ continue;
+ }
+
+ String w = word.wordLowerCase();
+ if (matchesWordPattern(w)) {
+ /* Add information about term positions */
+ wordsBuilder.addPos(w, pos);
+
+ /* Add metadata for word */
+ wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
+ }
- for (var recorder : spanRecorders) {
- recorder.update(linkText, pos);
}
- if (word.isStopWord()) {
- continue;
- }
+ // Add a break between sentences, to prevent them being registered as one long run-on sentence
+ extLinkRecorder.stop(pos + 1);
- String w = word.wordLowerCase();
- if (matchesWordPattern(w)) {
- /* Add information about term positions */
- wordsBuilder.addPos(w, pos);
-
- /* Add metadata for word */
- wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
- }
+ // Also add some positional padding between separate link texts so we don't match across their boundaries
+ pos += 2;
}
-
- // add some padding between separate link texts so we don't match across their boundaries
- pos+=2;
}
- for (var recorder : spanRecorders) {
- wordsBuilder.addSpans(recorder.finish(pos));
- }
+ wordsBuilder.addSpans(extLinkRecorder.finish(pos));
}
boolean matchesWordPattern(String s) {
@@ -265,6 +289,12 @@ public class DocumentKeywordExtractor {
}
}
+ public void stop(int pos) {
+ if (start > 0) {
+ spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
+ start = 0;
+ }
+ }
public List finish(int length) {
if (start > 0) {
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java
index c1ade6b4..0251c168 100644
--- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java
+++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java
@@ -1,14 +1,23 @@
package nu.marginalia.keyword;
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.language.model.DocumentSentence;
import org.jetbrains.annotations.NotNull;
import java.util.Iterator;
import java.util.List;
-public record LinkTexts(List linkTexts) implements Iterable {
+public record LinkTexts(
+ List linkTexts,
+ TIntList counts
+) implements Iterable {
public LinkTexts() {
- this(List.of());
+ this(List.of(), new TIntArrayList());
+ }
+
+ public int length() {
+ return linkTexts.size();
}
@NotNull