mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(converter/index) Improve atag sentence matching by taking into consideration how many times a sentence appears in the links
This change breaks the format of the atags.parquet file.
This commit is contained in:
parent
ee2d5496d0
commit
291ca8daf1
@ -123,13 +123,13 @@ public class DocumentSpan {
|
|||||||
|
|
||||||
/** Returns true if for any position in the list, there exists a range
|
/** Returns the number of spans found whose start equals position[i] and whose end
|
||||||
* (position[i], position[i]+len] that is overlapped by a span */
|
* equals position[i]+len for some position in the list; 0 if there is no such span */
|
||||||
public boolean containsRangeExact(IntList positions, int len) {
|
public int containsRangeExact(IntList positions, int len) {
|
||||||
if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) {
|
if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) {
|
||||||
return false;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int sei = 0;
|
int sei = 0;
|
||||||
|
int cnt = 0;
|
||||||
|
|
||||||
int start = startsEnds.getInt(sei++);
|
int start = startsEnds.getInt(sei++);
|
||||||
int end = startsEnds.getInt(sei++);
|
int end = startsEnds.getInt(sei++);
|
||||||
@ -138,7 +138,15 @@ public class DocumentSpan {
|
|||||||
int position = positions.getInt(pi);
|
int position = positions.getInt(pi);
|
||||||
|
|
||||||
if (position == start && position + len == end) {
|
if (position == start && position + len == end) {
|
||||||
return true;
|
cnt++;
|
||||||
|
if (sei + 2 <= startsEnds.size()) {
|
||||||
|
pi = 0;
|
||||||
|
start = startsEnds.getInt(sei++);
|
||||||
|
end = startsEnds.getInt(sei++);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (position < end) {
|
else if (position < end) {
|
||||||
pi++;
|
pi++;
|
||||||
@ -147,11 +155,11 @@ public class DocumentSpan {
|
|||||||
end = startsEnds.getInt(sei++);
|
end = startsEnds.getInt(sei++);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
return false;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return cnt;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int countRangeMatches(IntList positions, int len) {
|
public int countRangeMatches(IntList positions, int len) {
|
||||||
|
@ -115,16 +115,16 @@ class ForwardIndexSpansReaderTest {
|
|||||||
) {
|
) {
|
||||||
var spans1 = reader.readSpans(arena, offset1);
|
var spans1 = reader.readSpans(arena, offset1);
|
||||||
|
|
||||||
assertFalse(spans1.heading.containsRangeExact(IntList.of(10), 2));
|
assertEquals(0, spans1.heading.containsRangeExact(IntList.of(10), 2));
|
||||||
assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10), 2));
|
assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10), 2));
|
||||||
assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2));
|
assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2));
|
||||||
|
|
||||||
assertTrue(spans1.heading.containsRangeExact(IntList.of(10), 5));
|
assertEquals(1, spans1.heading.containsRangeExact(IntList.of(10), 5));
|
||||||
assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10), 5));
|
assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10), 5));
|
||||||
assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5));
|
assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5));
|
||||||
|
|
||||||
assertFalse(spans1.heading.containsRangeExact(IntList.of(11), 5));
|
assertEquals(0, spans1.heading.containsRangeExact(IntList.of(11), 5));
|
||||||
assertFalse(spans1.heading.containsRangeExact(IntList.of(9), 5));
|
assertEquals(0, spans1.heading.containsRangeExact(IntList.of(9), 5));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -388,11 +388,13 @@ public class IndexResultScoreCalculator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
|
var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
|
||||||
if (extLinkSpan.length() == fullGroup.size
|
if (extLinkSpan.length() >= fullGroup.size) {
|
||||||
&& extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size))
|
int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
|
||||||
{
|
if (cnt > 0) {
|
||||||
score += 2; // Add additional bonus if there's a single-word atag span
|
score += 2 * cnt;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -407,9 +409,9 @@ public class IndexResultScoreCalculator {
|
|||||||
|
|
||||||
// Bonus if there's a perfect match with an atag span
|
// Bonus if there's a perfect match with an atag span
|
||||||
var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
|
var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
|
||||||
if (extLinkSpan.length() == fullGroup.size && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size))
|
if (extLinkSpan.length() >= fullGroup.size) {
|
||||||
{
|
int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
|
||||||
score += 2;
|
score += 2*cnt;
|
||||||
}
|
}
|
||||||
|
|
||||||
// For optional groups, we scale the score by the size of the group relative to the full group
|
// For optional groups, we scale the score by the size of the group relative to the full group
|
||||||
@ -420,7 +422,7 @@ public class IndexResultScoreCalculator {
|
|||||||
IntList intersections = optionalGroup.findIntersections(positions);
|
IntList intersections = optionalGroup.findIntersections(positions);
|
||||||
|
|
||||||
for (var tag : HtmlTag.includedTags) {
|
for (var tag : HtmlTag.includedTags) {
|
||||||
int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);;
|
int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);
|
||||||
if (cnts > 0) {
|
if (cnts > 0) {
|
||||||
score += (float) (weights_partial[tag.ordinal()] * optionalGroup.size * sizeScalingFactor * (1 + Math.log(2 + cnts)));
|
score += (float) (weights_partial[tag.ordinal()] * optionalGroup.size * sizeScalingFactor * (1 + Math.log(2 + cnts)));
|
||||||
}
|
}
|
||||||
@ -457,7 +459,7 @@ public class IndexResultScoreCalculator {
|
|||||||
case NAV -> 0.1f;
|
case NAV -> 0.1f;
|
||||||
case CODE -> 0.25f;
|
case CODE -> 0.25f;
|
||||||
case BODY -> 1.0f;
|
case BODY -> 1.0f;
|
||||||
case EXTERNAL_LINKTEXT -> 0.75f;
|
case EXTERNAL_LINKTEXT -> 1.5f;
|
||||||
default -> 0.0f;
|
default -> 0.0f;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
package nu.marginalia.atags;
|
package nu.marginalia.atags;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import gnu.trove.list.TIntList;
|
||||||
|
import gnu.trove.list.array.TIntArrayList;
|
||||||
import nu.marginalia.atags.model.DomainLinks;
|
import nu.marginalia.atags.model.DomainLinks;
|
||||||
import nu.marginalia.atags.model.Link;
|
import nu.marginalia.atags.model.Link;
|
||||||
import nu.marginalia.keyword.LinkTexts;
|
import nu.marginalia.keyword.LinkTexts;
|
||||||
@ -51,6 +53,7 @@ public class AnchorTextKeywords {
|
|||||||
List<Link> keywordsRaw = links.forUrl(url);
|
List<Link> keywordsRaw = links.forUrl(url);
|
||||||
|
|
||||||
List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
|
List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
|
||||||
|
TIntList counts = new TIntArrayList(keywordsRaw.size());
|
||||||
|
|
||||||
// Extract and count keywords from anchor text
|
// Extract and count keywords from anchor text
|
||||||
for (Link keyword : keywordsRaw) {
|
for (Link keyword : keywordsRaw) {
|
||||||
@ -59,18 +62,20 @@ public class AnchorTextKeywords {
|
|||||||
|
|
||||||
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||||
ret.add(sentence);
|
ret.add(sentence);
|
||||||
|
counts.add(keyword.count());
|
||||||
}
|
}
|
||||||
|
|
||||||
return new LinkTexts(ret);
|
return new LinkTexts(ret, counts);
|
||||||
}
|
}
|
||||||
|
|
||||||
public LinkTexts getAnchorTextKeywords(DomainLinks links, List<EdgeUrl> urls) {
|
public LinkTexts getAnchorTextKeywords(DomainLinks links, List<EdgeUrl> urls) {
|
||||||
List<Link> keywordsRaw = new ArrayList<>();
|
List<Link> keywordsRaw = new ArrayList<>();
|
||||||
for (var url : urls) {
|
for (var url : urls) {
|
||||||
links.forUrl(url);
|
keywordsRaw.addAll(links.forUrl(url));
|
||||||
}
|
}
|
||||||
|
|
||||||
List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
|
List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
|
||||||
|
TIntList counts = new TIntArrayList(keywordsRaw.size());
|
||||||
|
|
||||||
// Extract and count keywords from anchor text
|
// Extract and count keywords from anchor text
|
||||||
for (Link keyword : keywordsRaw) {
|
for (Link keyword : keywordsRaw) {
|
||||||
@ -79,8 +84,9 @@ public class AnchorTextKeywords {
|
|||||||
|
|
||||||
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||||
ret.add(sentence);
|
ret.add(sentence);
|
||||||
|
counts.add(keyword.count());
|
||||||
}
|
}
|
||||||
|
|
||||||
return new LinkTexts(ret);
|
return new LinkTexts(ret, counts);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.atags.model;
|
package nu.marginalia.atags.model;
|
||||||
|
|
||||||
public record Link(String source, String text) {
|
public record Link(String text, int count) {
|
||||||
}
|
}
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
package nu.marginalia.atags.model;
|
package nu.marginalia.atags.model;
|
||||||
|
|
||||||
public record LinkWithText(String url, String text, String source) {
|
public record LinkWithText(String url, String text, int cnt) {
|
||||||
public Link toLink() {
|
public Link toLink() {
|
||||||
return new Link(source, text);
|
return new Link(text, cnt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -80,7 +80,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
|
|||||||
select
|
select
|
||||||
unnest(text) as 'text',
|
unnest(text) as 'text',
|
||||||
unnest(url) as 'url',
|
unnest(url) as 'url',
|
||||||
unnest(source) as 'source'
|
unnest(cnt) as 'cnt'
|
||||||
from atags
|
from atags
|
||||||
where dest = ?
|
where dest = ?
|
||||||
"""))
|
"""))
|
||||||
@ -89,7 +89,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
|
|||||||
ps.setString(1, domain.toString());
|
ps.setString(1, domain.toString());
|
||||||
var rs = ps.executeQuery();
|
var rs = ps.executeQuery();
|
||||||
while (rs.next()) {
|
while (rs.next()) {
|
||||||
links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source")));
|
links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getInt("cnt")));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu?
|
// Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu?
|
||||||
@ -102,7 +102,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
|
|||||||
String url = rs.getString("url");
|
String url = rs.getString("url");
|
||||||
url = aliasDomain + url.substring(url.indexOf('/'));
|
url = aliasDomain + url.substring(url.indexOf('/'));
|
||||||
|
|
||||||
links.add(new LinkWithText(url, rs.getString("text"), rs.getString("source")));
|
links.add(new LinkWithText(url, rs.getString("text"), rs.getInt("cnt")));
|
||||||
}
|
}
|
||||||
return new DomainLinks(links);
|
return new DomainLinks(links);
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.keyword;
|
package nu.marginalia.keyword;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import gnu.trove.list.TIntList;
|
||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.keyword.extractors.*;
|
import nu.marginalia.keyword.extractors.*;
|
||||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||||
@ -17,6 +18,9 @@ import java.util.Comparator;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import static java.lang.Math.min;
|
||||||
|
import static java.lang.Math.sqrt;
|
||||||
|
|
||||||
public class DocumentKeywordExtractor {
|
public class DocumentKeywordExtractor {
|
||||||
|
|
||||||
private final KeywordExtractor keywordExtractor;
|
private final KeywordExtractor keywordExtractor;
|
||||||
@ -162,40 +166,60 @@ public class DocumentKeywordExtractor {
|
|||||||
recorder.reset();
|
recorder.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---
|
||||||
|
|
||||||
// Next add synthetic positions to the document for anchor texts
|
// Next add synthetic positions to the document for anchor texts
|
||||||
|
|
||||||
pos += 2; // add some padding to the end of the document before we start adding a-tag words
|
pos += 2; // add some padding to the end of the document before we start adding a-tag words
|
||||||
|
|
||||||
for (var linkText : linkTexts) {
|
|
||||||
|
|
||||||
for (var word : linkText) {
|
// Add the link texts as synthetic sentences, repeating each one based on its link count
|
||||||
pos++;
|
|
||||||
|
List<DocumentSentence> sentences = linkTexts.linkTexts();
|
||||||
|
TIntList counts = linkTexts.counts();
|
||||||
|
SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT);
|
||||||
|
|
||||||
|
for (int i = 0; i < linkTexts.length(); i++) {
|
||||||
|
|
||||||
|
DocumentSentence sentence = sentences.get(i);
|
||||||
|
|
||||||
|
// We repeat a link sentence a number of times that is a function of how many times it's been spotted
|
||||||
|
// as a link text. A really "big" link typically has hundreds, if not thousands, of repetitions, so we
|
||||||
|
// attenuate the count (square root, capped at 12) so we don't generate a needlessly large positions list
|
||||||
|
|
||||||
|
final int repetitions = (int) min(sqrt(counts.get(i)), 12);
|
||||||
|
|
||||||
|
for (int ci = 0; ci < repetitions; ci++) {
|
||||||
|
|
||||||
|
for (var word : sentence) {
|
||||||
|
pos++;
|
||||||
|
|
||||||
|
extLinkRecorder.update(sentence, pos);
|
||||||
|
|
||||||
|
if (word.isStopWord()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
String w = word.wordLowerCase();
|
||||||
|
if (matchesWordPattern(w)) {
|
||||||
|
/* Add information about term positions */
|
||||||
|
wordsBuilder.addPos(w, pos);
|
||||||
|
|
||||||
|
/* Add metadata for word */
|
||||||
|
wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
|
||||||
|
}
|
||||||
|
|
||||||
for (var recorder : spanRecorders) {
|
|
||||||
recorder.update(linkText, pos);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (word.isStopWord()) {
|
// Add a break between sentences, to prevent them being registered as one long run-on sentence
|
||||||
continue;
|
extLinkRecorder.stop(pos + 1);
|
||||||
}
|
|
||||||
|
|
||||||
String w = word.wordLowerCase();
|
// Also add some positional padding between separate link texts so we don't match across their boundaries
|
||||||
if (matchesWordPattern(w)) {
|
pos += 2;
|
||||||
/* Add information about term positions */
|
|
||||||
wordsBuilder.addPos(w, pos);
|
|
||||||
|
|
||||||
/* Add metadata for word */
|
|
||||||
wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// add some padding between separate link texts so we don't match across their boundaries
|
|
||||||
pos+=2;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (var recorder : spanRecorders) {
|
wordsBuilder.addSpans(extLinkRecorder.finish(pos));
|
||||||
wordsBuilder.addSpans(recorder.finish(pos));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean matchesWordPattern(String s) {
|
boolean matchesWordPattern(String s) {
|
||||||
@ -265,6 +289,12 @@ public class DocumentKeywordExtractor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void stop(int pos) {
|
||||||
|
if (start > 0) {
|
||||||
|
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
|
||||||
|
start = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
|
public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
|
||||||
if (start > 0) {
|
if (start > 0) {
|
||||||
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
|
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
|
||||||
|
@ -1,14 +1,23 @@
|
|||||||
package nu.marginalia.keyword;
|
package nu.marginalia.keyword;
|
||||||
|
|
||||||
|
import gnu.trove.list.TIntList;
|
||||||
|
import gnu.trove.list.array.TIntArrayList;
|
||||||
import nu.marginalia.language.model.DocumentSentence;
|
import nu.marginalia.language.model.DocumentSentence;
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
public record LinkTexts(List<DocumentSentence> linkTexts) implements Iterable<DocumentSentence> {
|
public record LinkTexts(
|
||||||
|
List<DocumentSentence> linkTexts,
|
||||||
|
TIntList counts
|
||||||
|
) implements Iterable<DocumentSentence> {
|
||||||
public LinkTexts() {
|
public LinkTexts() {
|
||||||
this(List.of());
|
this(List.of(), new TIntArrayList());
|
||||||
|
}
|
||||||
|
|
||||||
|
public int length() {
|
||||||
|
return linkTexts.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
@NotNull
|
@NotNull
|
||||||
|
Loading…
Reference in New Issue
Block a user