Merge pull request #129 from MarginaliaSearch/atags-counts

(WIP) Improve atag sentence matching
This commit is contained in:
Viktor 2024-12-10 12:42:34 +00:00 committed by GitHub
commit 589f4dafb9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 504 additions and 240 deletions

View File

@ -123,13 +123,13 @@ public class DocumentSpan {
/** Returns true if for any position in the list, there exists a range
* (position[i], position[i]+len] that is overlapped by a span */
public boolean containsRangeExact(IntList positions, int len) {
public int containsRangeExact(IntList positions, int len) {
if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) {
return false;
return 0;
}
int sei = 0;
int cnt = 0;
int start = startsEnds.getInt(sei++);
int end = startsEnds.getInt(sei++);
@ -138,7 +138,15 @@ public class DocumentSpan {
int position = positions.getInt(pi);
if (position == start && position + len == end) {
return true;
cnt++;
if (sei + 2 <= startsEnds.size()) {
pi = 0;
start = startsEnds.getInt(sei++);
end = startsEnds.getInt(sei++);
}
else {
break;
}
}
else if (position < end) {
pi++;
@ -147,11 +155,11 @@ public class DocumentSpan {
end = startsEnds.getInt(sei++);
}
else {
return false;
break;
}
}
return false;
return cnt;
}
public int countRangeMatches(IntList positions, int len) {

View File

@ -115,16 +115,16 @@ class ForwardIndexSpansReaderTest {
) {
var spans1 = reader.readSpans(arena, offset1);
assertFalse(spans1.heading.containsRangeExact(IntList.of(10), 2));
assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10), 2));
assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2));
assertEquals(0, spans1.heading.containsRangeExact(IntList.of(10), 2));
assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10), 2));
assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2));
assertTrue(spans1.heading.containsRangeExact(IntList.of(10), 5));
assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10), 5));
assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5));
assertEquals(1, spans1.heading.containsRangeExact(IntList.of(10), 5));
assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10), 5));
assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5));
assertFalse(spans1.heading.containsRangeExact(IntList.of(11), 5));
assertFalse(spans1.heading.containsRangeExact(IntList.of(9), 5));
assertEquals(0, spans1.heading.containsRangeExact(IntList.of(11), 5));
assertEquals(0, spans1.heading.containsRangeExact(IntList.of(9), 5));
}
}

View File

@ -388,11 +388,13 @@ public class IndexResultScoreCalculator {
}
var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
if (extLinkSpan.length() == fullGroup.size
&& extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size))
{
score += 2; // Add additional bonus if there's a single-word atag span
if (extLinkSpan.length() >= fullGroup.size) {
int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
if (cnt > 0) {
score += 2 * cnt;
}
}
return;
}
@ -407,9 +409,9 @@ public class IndexResultScoreCalculator {
// Bonus if there's a perfect match with an atag span
var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
if (extLinkSpan.length() == fullGroup.size && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size))
{
score += 2;
if (extLinkSpan.length() >= fullGroup.size) {
int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
score += 2*cnt;
}
// For optional groups, we scale the score by the size of the group relative to the full group
@ -420,7 +422,7 @@ public class IndexResultScoreCalculator {
IntList intersections = optionalGroup.findIntersections(positions);
for (var tag : HtmlTag.includedTags) {
int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);;
int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);
if (cnts > 0) {
score += (float) (weights_partial[tag.ordinal()] * optionalGroup.size * sizeScalingFactor * (1 + Math.log(2 + cnts)));
}
@ -457,7 +459,7 @@ public class IndexResultScoreCalculator {
case NAV -> 0.1f;
case CODE -> 0.25f;
case BODY -> 1.0f;
case EXTERNAL_LINKTEXT -> 0.75f;
case EXTERNAL_LINKTEXT -> 1.5f;
default -> 0.0f;
};
}

View File

@ -1,6 +1,8 @@
package nu.marginalia.atags;
import com.google.inject.Inject;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.model.Link;
import nu.marginalia.keyword.LinkTexts;
@ -51,6 +53,7 @@ public class AnchorTextKeywords {
List<Link> keywordsRaw = links.forUrl(url);
List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
TIntList counts = new TIntArrayList(keywordsRaw.size());
// Extract and count keywords from anchor text
for (Link keyword : keywordsRaw) {
@ -59,18 +62,20 @@ public class AnchorTextKeywords {
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
ret.add(sentence);
counts.add(keyword.count());
}
return new LinkTexts(ret);
return new LinkTexts(ret, counts);
}
public LinkTexts getAnchorTextKeywords(DomainLinks links, List<EdgeUrl> urls) {
List<Link> keywordsRaw = new ArrayList<>();
for (var url : urls) {
links.forUrl(url);
keywordsRaw.addAll(links.forUrl(url));
}
List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
TIntList counts = new TIntArrayList(keywordsRaw.size());
// Extract and count keywords from anchor text
for (Link keyword : keywordsRaw) {
@ -79,8 +84,9 @@ public class AnchorTextKeywords {
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
ret.add(sentence);
counts.add(keyword.count());
}
return new LinkTexts(ret);
return new LinkTexts(ret, counts);
}
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.atags.model;
public record Link(String source, String text) {
public record Link(String text, int count) {
}

View File

@ -1,7 +1,7 @@
package nu.marginalia.atags.model;
public record LinkWithText(String url, String text, String source) {
public record LinkWithText(String url, String text, int cnt) {
public Link toLink() {
return new Link(source, text);
return new Link(text, cnt);
}
}

View File

@ -80,7 +80,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
select
unnest(text) as 'text',
unnest(url) as 'url',
unnest(source) as 'source'
unnest(cnt) as 'cnt'
from atags
where dest = ?
"""))
@ -89,7 +89,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
ps.setString(1, domain.toString());
var rs = ps.executeQuery();
while (rs.next()) {
links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source")));
links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getInt("cnt")));
}
// Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu?
@ -102,7 +102,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
String url = rs.getString("url");
url = aliasDomain + url.substring(url.indexOf('/'));
links.add(new LinkWithText(url, rs.getString("text"), rs.getString("source")));
links.add(new LinkWithText(url, rs.getString("text"), rs.getInt("cnt")));
}
return new DomainLinks(links);
}

View File

@ -5,35 +5,29 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Stream;
public class DocumentKeywordExtractor {
private final KeywordExtractor keywordExtractor;
private final TermFrequencyDict dict;
private final KeywordExtractor keywordExtractor = new KeywordExtractor();
private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
@Inject
public DocumentKeywordExtractor(TermFrequencyDict dict) {
this.dict = dict;
this.keywordExtractor = new KeywordExtractor();
}
// for tests
public DocumentKeywordExtractor() {
try {
this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels());
this.keywordExtractor = new KeywordExtractor();
}
catch (Exception ex) {
throw new RuntimeException(ex);
@ -60,7 +54,7 @@ public class DocumentKeywordExtractor {
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
createSimpleWords(wordsBuilder, keywordMetadata, dld, linkTexts);
positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
@ -106,176 +100,4 @@ public class DocumentKeywordExtractor {
}
}
private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
DocumentLanguageData dld,
LinkTexts linkTexts)
{
// we use 1-based indexing since the data
// will be gamma encoded, and it can't represent 0
int pos = 0;
List<SpanRecorder> spanRecorders = new ArrayList<>();
for (var htmlTag : HtmlTag.includedTags) {
if (!htmlTag.exclude) {
spanRecorders.add(new SpanRecorder(htmlTag));
}
}
for (DocumentSentence sent : dld) {
for (var word : sent) {
pos++;
for (var recorder : spanRecorders) {
recorder.update(sent, pos);
}
if (word.isStopWord()) {
continue;
}
String w = word.wordLowerCase();
if (matchesWordPattern(w)) {
/* Add information about term positions */
wordsBuilder.addPos(w, pos);
/* Add metadata for word */
wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
}
}
for (var names : keywordExtractor.getProperNames(sent)) {
var rep = new WordRep(sent, names);
byte meta = metadata.getMetadataForWord(rep.stemmed);
wordsBuilder.addMeta(rep.word, meta);
}
}
pos++; // we need to add one more position to account for the last word in the document
for (var recorder : spanRecorders) {
wordsBuilder.addSpans(recorder.finish(pos));
// reset the recorder, so we can use it again without adding the same positions twice
recorder.reset();
}
// Next add synthetic positions to the document for anchor texts
pos += 2; // add some padding to the end of the document before we start adding a-tag words
for (var linkText : linkTexts) {
for (var word : linkText) {
pos++;
for (var recorder : spanRecorders) {
recorder.update(linkText, pos);
}
if (word.isStopWord()) {
continue;
}
String w = word.wordLowerCase();
if (matchesWordPattern(w)) {
/* Add information about term positions */
wordsBuilder.addPos(w, pos);
/* Add metadata for word */
wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
}
}
// add some padding between separate link texts so we don't match across their boundaries
pos+=2;
}
for (var recorder : spanRecorders) {
wordsBuilder.addSpans(recorder.finish(pos));
}
}
boolean matchesWordPattern(String s) {
// this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
String wordPartSeparator = ".-_/:+*";
int i = 0;
for (int run = 0; run < 15 && i < s.length(); run++, i++) {
char c = s.charAt(i);
if (c >= 'a' && c <= 'z') continue;
if (c >= 'A' && c <= 'Z') continue;
if (c >= '0' && c <= '9') continue;
break;
}
if (i == 0)
return false;
for (int j = 0; j < 5; j++) {
if (i == s.length()) return true;
if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
return false;
}
i++;
for (int run = 0; run < 10 && i < s.length(); run++, i++) {
char c = s.charAt(i);
if (c >= 'a' && c <= 'z') continue;
if (c >= 'A' && c <= 'Z') continue;
if (c >= '0' && c <= '9') continue;
break;
}
}
return false;
}
/** Helper class to record spans of words */
private static class SpanRecorder {
private List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
private final HtmlTag htmlTag;
private int start = 0;
public SpanRecorder(HtmlTag htmlTag) {
this.htmlTag = htmlTag;
}
public void update(DocumentSentence sentence, int pos) {
assert pos > 0;
if (
sentence.htmlTags.contains(htmlTag)
|| (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY) // special case for body tag, we match against no tag on the sentence
)
{
if (start <= 0) start = pos;
}
else {
if (start > 0) {
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
start = 0;
}
}
}
public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
if (start > 0) {
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
start = 0;
}
return spans;
}
public void reset() {
spans.clear();
start = 0;
}
}
}

View File

@ -0,0 +1,237 @@
package nu.marginalia.keyword;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.sentence.tag.HtmlTag;
import java.util.ArrayList;
import java.util.List;
import static java.lang.Math.min;
import static java.lang.Math.sqrt;
/**
 * DocumentPositionMapper is responsible for assigning keywords positions in the document,
 * as well as recording spans of positions.
 * <p>
 * The words of the document itself are numbered first.  Link texts are then placed in a
 * virtual region after the end of the document, with padding in between so that a match
 * can not straddle the document and a link text, or two separate link texts.
 */
public class DocumentPositionMapper {

    /** Characters that may join the parts of a compound word, e.g. {@code std::vector}. */
    private static final String WORD_PART_SEPARATORS = ".-_/:+*";

    /** Longest accepted run of alphanumerics at the start of a word. */
    private static final int MAX_LEADING_PART_LENGTH = 15;

    /** Longest accepted run of alphanumerics following a separator. */
    private static final int MAX_TRAILING_PART_LENGTH = 10;

    /** Largest number of separators a word may contain. */
    private static final int MAX_SEPARATORS = 4;

    /** Upper bound for how many times a single link text is repeated. */
    private static final int MAX_LINK_TEXT_REPETITIONS = 12;

    private final KeywordExtractor keywordExtractor = new KeywordExtractor();

    /**
     * Maps the positions of the words in the document, then the positions of the
     * link texts, adding positions, metadata and spans to {@code wordsBuilder}.
     *
     * @param wordsBuilder receives word positions, word metadata and spans
     * @param metadata     source of the per-word metadata byte
     * @param dld          the sentences of the document
     * @param linkTexts    link texts along with how many times each was seen
     */
    public void mapPositionsAndExtractSimpleKeywords(DocumentKeywordsBuilder wordsBuilder,
                                                     KeywordMetadata metadata,
                                                     DocumentLanguageData dld,
                                                     LinkTexts linkTexts)
    {
        // First map the words in the document to their positions
        int pos = mapDocumentPositions(wordsBuilder, metadata, dld);

        // Next create some padding space to avoid cross-matching
        pos += 2;

        // Finally allocate some virtual space after the end of the document
        // for the link texts, so that we can match against them as well, although
        // these will be given a different span type.
        mapLinkTextPositions(pos, wordsBuilder, metadata, linkTexts);
    }

    /**
     * Assigns a position to every word of the document and records the spans of
     * each non-excluded tag in {@code HtmlTag.includedTags}.
     *
     * @return the position immediately after the last word of the document
     */
    int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
                             KeywordMetadata metadata,
                             DocumentLanguageData dld)
    {
        List<SpanRecorder> spanRecorders = new ArrayList<>();
        for (var htmlTag : HtmlTag.includedTags) {
            if (!htmlTag.exclude) {
                spanRecorders.add(new SpanRecorder(htmlTag));
            }
        }

        // we use 1-based indexing since the data
        // will be gamma encoded, and it can't represent 0;
        // but the loop starts by incrementing the position,
        // so while unintuitive, zero is correct here.
        int pos = 0;

        for (DocumentSentence sent : dld) {
            for (var word : sent) {
                pos++;

                // Update span position tracking
                for (var recorder : spanRecorders) {
                    recorder.update(sent, pos);
                }

                // Stop words occupy a position but are not registered as keywords
                if (word.isStopWord()) {
                    continue;
                }

                registerWord(wordsBuilder, metadata, word.wordLowerCase(), word.stemmed(), pos);
            }

            // Proper names only get metadata, no positions
            for (var names : keywordExtractor.getProperNames(sent)) {
                WordRep rep = new WordRep(sent, names);
                byte meta = metadata.getMetadataForWord(rep.stemmed);
                wordsBuilder.addMeta(rep.word, meta);
            }
        }

        pos++; // we need to add one more position to account for the last word in the document

        for (var recorder : spanRecorders) {
            wordsBuilder.addSpans(recorder.finish(pos));
        }

        return pos;
    }

    /**
     * Assigns positions to the words of the link texts, starting after {@code startPos},
     * and records one EXTERNAL_LINKTEXT span per repetition of each link text.
     *
     * @param startPos the last position already in use; the first link text word
     *                 is given position {@code startPos + 1}
     */
    void mapLinkTextPositions(int startPos,
                              DocumentKeywordsBuilder wordsBuilder,
                              KeywordMetadata metadata,
                              LinkTexts linkTexts)
    {
        int pos = startPos;

        SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT);

        LinkTexts.Iter iter = linkTexts.iterator();
        while (iter.next()) {
            DocumentSentence sentence = iter.sentence();
            final int repetitions = linkTextRepetitions(iter.count());

            for (int ci = 0; ci < repetitions; ci++) {
                for (var word : sentence) {
                    pos++;

                    extLinkRecorder.update(sentence, pos);

                    if (word.isStopWord()) {
                        continue;
                    }

                    registerWord(wordsBuilder, metadata, word.wordLowerCase(), word.stemmed(), pos);
                }

                // Add a break between sentences, to prevent them being registered as one long run-on sentence
                extLinkRecorder.endCurrentSpan(pos + 1);

                // Also add some positional padding between separate link texts so we don't match across their boundaries
                pos += 2;
            }
        }

        wordsBuilder.addSpans(extLinkRecorder.finish(pos));
    }

    /**
     * Decides how many times a link text is repeated, as a function of how many
     * times it has been spotted as a link text.  A really "big" link typically has
     * hundreds, if not thousands of repetitions, so the count is attenuated with a
     * square root and capped, to avoid generating a needlessly large positions list.
     *
     * @return a value between 1 and {@link #MAX_LINK_TEXT_REPETITIONS}
     */
    private static int linkTextRepetitions(int count) {
        // Clamp before sqrt(): the square root of a negative number is NaN, NaN survives
        // both min() and max(), and casts to 0, which would drop the link text entirely.
        double attenuated = Math.sqrt(Math.max(count, 0));

        return (int) Math.max(1.0, Math.min(attenuated, MAX_LINK_TEXT_REPETITIONS));
    }

    /** Adds the position and metadata of a word, provided it looks like a word at all. */
    private void registerWord(DocumentKeywordsBuilder wordsBuilder,
                              KeywordMetadata metadata,
                              String word,
                              String stemmed,
                              int pos)
    {
        if (!matchesWordPattern(word)) {
            return;
        }

        /* Add information about term positions */
        wordsBuilder.addPos(word, pos);

        /* Add metadata for word */
        wordsBuilder.addMeta(word, metadata.getMetadataForWord(stemmed));
    }

    /**
     * Tests whether a string looks like an indexable word.
     * <p>
     * This is a hand-unrolled approximation of the regexp
     * {@code [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{0,10}){0,4}}: a run of 1-15 ASCII
     * letters or digits, followed by up to four groups made up of a separator and a
     * run of at most 10 letters or digits.  The run after a separator may be empty,
     * which is what lets a word such as {@code c++} through.
     */
    boolean matchesWordPattern(String s) {
        int i = skipAlphanumerics(s, 0, MAX_LEADING_PART_LENGTH);

        // The word must start with at least one letter or digit
        if (i == 0) {
            return false;
        }

        for (int separators = 0; separators <= MAX_SEPARATORS; separators++) {
            if (i == s.length()) {
                return true;
            }
            if (WORD_PART_SEPARATORS.indexOf(s.charAt(i)) < 0) {
                return false;
            }

            i = skipAlphanumerics(s, i + 1, MAX_TRAILING_PART_LENGTH);
        }

        // More separators than permitted
        return false;
    }

    /**
     * Returns the index of the first character at or after {@code from} that is not an
     * ASCII letter or digit, looking at no more than {@code maxRun} characters.
     */
    private static int skipAlphanumerics(String s, int from, int maxRun) {
        int limit = Math.min(s.length(), from + maxRun);
        int i = from;

        while (i < limit && isAsciiAlphanumeric(s.charAt(i))) {
            i++;
        }

        return i;
    }

    private static boolean isAsciiAlphanumeric(char c) {
        return (c >= 'a' && c <= 'z')
            || (c >= 'A' && c <= 'Z')
            || (c >= '0' && c <= '9');
    }

    /** Helper class to record spans of words */
    private static class SpanRecorder {
        private final List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
        private final HtmlTag htmlTag;

        /** Start position of the span currently being recorded, or 0 if there is none */
        private int start = 0;

        public SpanRecorder(HtmlTag htmlTag) {
            this.htmlTag = htmlTag;
        }

        /**
         * Registers that the word at {@code pos} belongs to {@code sentence}; opens a span
         * if the sentence carries this recorder's tag, and otherwise closes any open span
         * at {@code pos}.
         */
        public void update(DocumentSentence sentence, int pos) {
            assert pos > 0;

            boolean tagged = sentence.htmlTags.contains(htmlTag)
                    // special case for body tag, we match against no tag on the sentence
                    || (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY);

            if (tagged) {
                if (start <= 0) {
                    start = pos;
                }
            }
            else {
                endCurrentSpan(pos);
            }
        }

        /** Closes the span being recorded, if any, giving it the end position {@code pos}. */
        public void endCurrentSpan(int pos) {
            if (start > 0) {
                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
                start = 0;
            }
        }

        /**
         * Closes the span being recorded, if any, at {@code length} and returns
         * all spans recorded so far.
         */
        public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
            endCurrentSpan(length);

            return spans;
        }
    }
}

View File

@ -6,7 +6,7 @@ import nu.marginalia.keyword.extractors.TitleKeywords;
import nu.marginalia.keyword.extractors.UrlKeywords;
import nu.marginalia.model.idx.WordFlags;
class KeywordMetadata {
public class KeywordMetadata {
private final TitleKeywords titleKeywords;
private final NameLikeKeywords nameLikeKeywords;

View File

@ -1,19 +1,40 @@
package nu.marginalia.keyword;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.language.model.DocumentSentence;
import org.jetbrains.annotations.NotNull;
import java.util.Iterator;
import java.util.List;
public record LinkTexts(List<DocumentSentence> linkTexts) implements Iterable<DocumentSentence> {
public record LinkTexts(
List<DocumentSentence> linkTexts,
TIntList counts
) {
public LinkTexts() {
this(List.of());
this(List.of(), new TIntArrayList());
}
public int length() {
return linkTexts.size();
}
@NotNull
@Override
public Iterator<DocumentSentence> iterator() {
return linkTexts.iterator();
public LinkTexts.Iter iterator() {
return new Iter();
}
public class Iter {
private int pos = -1;
public boolean next() {
return ++pos < length();
}
public int count() {
return counts.get(pos);
}
public DocumentSentence sentence() {
return linkTexts.get(pos);
}
}
}

View File

@ -17,7 +17,7 @@ import java.util.*;
public class DocumentKeywordsBuilder {
public final Object2ByteOpenHashMap<String> wordToMeta;
public final HashMap<String, IntList> wordToPos;
public final Map<Character, List<DocumentWordSpan>> wordSpans = new HashMap<>();
public final Map<HtmlTag, List<DocumentWordSpan>> wordSpans = new HashMap<>();
/**
* These are keywords that had signals of high relevance
@ -70,7 +70,7 @@ public class DocumentKeywordsBuilder {
positionsForTag.add(span.end());
}
spans.add(new CodedWordSpan((byte) tag.charValue(), VarintCodedSequence.generate(positionsForTag)));
spans.add(new CodedWordSpan(tag.code, VarintCodedSequence.generate(positionsForTag)));
});
return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);
@ -128,7 +128,7 @@ public class DocumentKeywordsBuilder {
public void addSpans(List<DocumentWordSpan> newSpans) {
for (var span : newSpans) {
wordSpans.computeIfAbsent((char) span.tag().code, k -> new ArrayList<>()).add(span);
wordSpans.computeIfAbsent(span.tag(), k -> new ArrayList<>()).add(span);
}
}

View File

@ -25,21 +25,6 @@ class DocumentKeywordExtractorTest {
static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor();
static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
@Test
public void testWordPattern() {
Assertions.assertTrue(extractor.matchesWordPattern("test"));
Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));
Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test"));
Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test"));
Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24"));
Assertions.assertTrue(extractor.matchesWordPattern("std::vector"));
Assertions.assertTrue(extractor.matchesWordPattern("c++"));
Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h"));
Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
}
@Test
public void testKeyboards2() throws IOException, URISyntaxException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),

View File

@ -0,0 +1,184 @@
package nu.marginalia.keyword;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
class DocumentPositionMapperTest {
private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
@Test
public void testWordPattern() {
Assertions.assertTrue(positionMapper.matchesWordPattern("test"));
Assertions.assertTrue(positionMapper.matchesWordPattern("1234567890abcde"));
Assertions.assertFalse(positionMapper.matchesWordPattern("1234567890abcdef"));
Assertions.assertTrue(positionMapper.matchesWordPattern("test-test-test-test-test"));
Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test"));
Assertions.assertTrue(positionMapper.matchesWordPattern("192.168.1.100/24"));
Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector"));
Assertions.assertTrue(positionMapper.matchesWordPattern("c++"));
Assertions.assertTrue(positionMapper.matchesWordPattern("m*a*s*h"));
Assertions.assertFalse(positionMapper.matchesWordPattern("Stulpnagelstrasse"));
}
@Test
public void testBasic() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
DocumentLanguageData dld = new DocumentLanguageData(
se.extractSentencesFromString("I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
"I am a teapot"
);
int pos = positionMapper.mapDocumentPositions(keywordsBuilder, Mockito.mock(KeywordMetadata.class), dld);
assertEquals(8, pos);
assertEquals(IntList.of(1), keywordsBuilder.wordToPos.get("i"));
assertEquals(IntList.of(2), keywordsBuilder.wordToPos.get("am"));
assertEquals(IntList.of(3), keywordsBuilder.wordToPos.get("a"));
assertEquals(IntList.of(4), keywordsBuilder.wordToPos.get("teapot"));
assertEquals(IntList.of(5), keywordsBuilder.wordToPos.get("short"));
assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("and"));
assertEquals(IntList.of(7), keywordsBuilder.wordToPos.get("stout"));
var codeSpans = keywordsBuilder.wordSpans.get(HtmlTag.CODE);
assertEquals(1, codeSpans.size());
var codeSpan = codeSpans.getFirst();
assertEquals(1, codeSpan.start());
assertEquals(8, codeSpan.end());
}
@Test
public void testLinksSingleWord1Rep() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
assertEquals(1, sentences.size());
TIntList counts = new TIntArrayList(new int[] { 1 });
positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
new LinkTexts(sentences, counts));
assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("zelda"));
var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
assertEquals(1, linkTextSpans.size());
var codeSpan = linkTextSpans.getFirst();
assertEquals(6, codeSpan.start());
assertEquals(7, codeSpan.end());
}
@Test
public void testLinksSingleWord2Reps() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
assertEquals(1, sentences.size());
TIntList counts = new TIntArrayList(new int[] { 4 }); // This will become 2 repetitions, formula is ~ sqrt(counts)
positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
new LinkTexts(sentences, counts));
assertEquals(IntList.of(6, 9), keywordsBuilder.wordToPos.get("zelda"));
var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
assertEquals(2, linkTextSpans.size());
DocumentKeywordsBuilder.DocumentWordSpan span;
span = linkTextSpans.get(0);
assertEquals(6, span.start());
assertEquals(7, span.end());
span = linkTextSpans.get(1);
assertEquals(9, span.start());
assertEquals(10, span.end());
}
@Test
public void testLinksTwoWords2Reps() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
var sentences = se.extractSentencesFromString("Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
assertEquals(1, sentences.size());
TIntList counts = new TIntArrayList(new int[] { 4 });
positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
new LinkTexts(sentences, counts));
assertEquals(IntList.of(6, 10), keywordsBuilder.wordToPos.get("zelda"));
assertEquals(IntList.of(7, 11), keywordsBuilder.wordToPos.get("ii"));
var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
assertEquals(2, linkTextSpans.size());
DocumentKeywordsBuilder.DocumentWordSpan span;
span = linkTextSpans.get(0);
assertEquals(6, span.start());
assertEquals(8, span.end());
span = linkTextSpans.get(1);
assertEquals(10, span.start());
assertEquals(12, span.end());
}
@Test
public void testLinksTwoSent1Word1Rep() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
var sentences1 = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentences2 = se.extractSentencesFromString("Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
assertEquals(1, sentences1.size());
assertEquals(1, sentences2.size());
TIntList counts = new TIntArrayList(new int[] { 1, 1 });
List<DocumentSentence> sentencesAll = new ArrayList<>();
sentencesAll.addAll(sentences1);
sentencesAll.addAll(sentences2);
positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
new LinkTexts(sentencesAll, counts));
assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("zelda"));
assertEquals(IntList.of(9), keywordsBuilder.wordToPos.get("link"));
var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
assertEquals(2, linkTextSpans.size());
DocumentKeywordsBuilder.DocumentWordSpan span;
span = linkTextSpans.get(0);
assertEquals(6, span.start());
assertEquals(7, span.end());
span = linkTextSpans.get(1);
assertEquals(9, span.start());
assertEquals(10, span.end());
}
}

View File

@ -75,7 +75,6 @@ public class DocumentLoaderService {
public void accept(SlopDocumentRecord.MetadataProjection projection)
{
long urlId = UrlIdCodec.encodeId(
domainIdRegistry.getDomainId(projection.domain()),
projection.ordinal()
@ -88,7 +87,7 @@ public class DocumentLoaderService {
}
try {
documentDbWriter.add(new DocdbUrlDetail(
details.add(new DocdbUrlDetail(
urlId,
parsedUrl.get(),
projection.title(),