diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java
index 5ab5d166..bf077683 100644
--- a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java
+++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java
@@ -123,13 +123,13 @@ public class DocumentSpan {
 
     /** Returns true if for any position in the list, there exists a range
      * (position[i], position[i]+len] that is overlapped by a span */
-    public boolean containsRangeExact(IntList positions, int len) {
+    public int containsRangeExact(IntList positions, int len) {
         if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) {
-            return false;
+            return 0;
         }
 
         int sei = 0;
-
+        int cnt = 0;
         int start = startsEnds.getInt(sei++);
         int end = startsEnds.getInt(sei++);
 
@@ -138,7 +138,15 @@
             int position = positions.getInt(pi);
 
             if (position == start && position + len == end) {
-                return true;
+                cnt++;
+                if (sei + 2 <= startsEnds.size()) {
+                    pi = 0;
+                    start = startsEnds.getInt(sei++);
+                    end = startsEnds.getInt(sei++);
+                }
+                else {
+                    break;
+                }
             }
             else if (position < end) {
                 pi++;
@@ -147,11 +155,11 @@
                 end = startsEnds.getInt(sei++);
             }
             else {
-                return false;
+                break;
             }
         }
 
-        return false;
+        return cnt;
     }
 
     public int countRangeMatches(IntList positions, int len) {
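The hunk above changes containsRangeExact from a boolean test into a match count. A minimal standalone sketch of that counting behaviour, not part of the patch, with invented class and method names, assuming the same flat starts/ends layout as DocumentSpan:

import it.unimi.dsi.fastutil.ints.IntList;

/** Sketch only: each recorded span [start, end) contributes at most one hit,
 *  and a hit requires some position p with p == start and p + len == end. */
class ExactRangeCountSketch {

    static int countExactMatches(IntList startsEnds, IntList positions, int len) {
        if (startsEnds == null || startsEnds.size() < 2 || positions.isEmpty())
            return 0;

        int cnt = 0;
        int sei = 0;
        int start = startsEnds.getInt(sei++);
        int end = startsEnds.getInt(sei++);

        int pi = 0;
        while (pi < positions.size()) {
            int position = positions.getInt(pi);

            if (position == start && position + len == end) {
                cnt++;                                  // exact match for this span
                if (sei + 2 <= startsEnds.size()) {     // more spans: rescan the positions
                    pi = 0;
                    start = startsEnds.getInt(sei++);
                    end = startsEnds.getInt(sei++);
                }
                else break;
            }
            else if (position < end) {
                pi++;                                   // try the next position
            }
            else if (sei + 2 <= startsEnds.size()) {    // position is past this span: advance the span
                start = startsEnds.getInt(sei++);
                end = startsEnds.getInt(sei++);
            }
            else break;
        }
        return cnt;
    }

    public static void main(String[] args) {
        IntList startsEnds = IntList.of(10, 15, 20, 25);   // two spans: [10,15) and [20,25)

        System.out.println(countExactMatches(startsEnds, IntList.of(10, 20), 5)); // 2
        System.out.println(countExactMatches(startsEnds, IntList.of(10), 5));     // 1
        System.out.println(countExactMatches(startsEnds, IntList.of(11), 5));     // 0
    }
}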
diff --git a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java
index f0170883..a5085c25 100644
--- a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java
+++ b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java
@@ -115,16 +115,16 @@ class ForwardIndexSpansReaderTest {
         ) {
             var spans1 = reader.readSpans(arena, offset1);
 
-            assertFalse(spans1.heading.containsRangeExact(IntList.of(10), 2));
-            assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10), 2));
-            assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2));
+            assertEquals(0, spans1.heading.containsRangeExact(IntList.of(10), 2));
+            assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10), 2));
+            assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2));
 
-            assertTrue(spans1.heading.containsRangeExact(IntList.of(10), 5));
-            assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10), 5));
-            assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5));
+            assertEquals(1, spans1.heading.containsRangeExact(IntList.of(10), 5));
+            assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10), 5));
+            assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5));
 
-            assertFalse(spans1.heading.containsRangeExact(IntList.of(11), 5));
-            assertFalse(spans1.heading.containsRangeExact(IntList.of(9), 5));
+            assertEquals(0, spans1.heading.containsRangeExact(IntList.of(11), 5));
+            assertEquals(0, spans1.heading.containsRangeExact(IntList.of(9), 5));
 
         }
     }
diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
index 74ad0e60..10f4ea05 100644
--- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
+++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
@@ -388,11 +388,13 @@ public class IndexResultScoreCalculator {
             }
 
             var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
-            if (extLinkSpan.length() == fullGroup.size
-                    && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size))
-            {
-                score += 2; // Add additional bonus if there's a single-word atag span
+            if (extLinkSpan.length() >= fullGroup.size) {
+                int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
+                if (cnt > 0) {
+                    score += 2 * cnt;
+                }
             }
+
             return;
         }
 
@@ -407,9 +409,9 @@ public class IndexResultScoreCalculator {
 
         // Bonus if there's a perfect match with an atag span
        var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
-        if (extLinkSpan.length() == fullGroup.size && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size))
-        {
-            score += 2;
+        if (extLinkSpan.length() >= fullGroup.size) {
+            int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
+            score += 2*cnt;
         }
 
         // For optional groups, we scale the score by the size of the group relative to the full group
@@ -420,7 +422,7 @@ public class IndexResultScoreCalculator {
             IntList intersections = optionalGroup.findIntersections(positions);
 
             for (var tag : HtmlTag.includedTags) {
-                int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);;
+                int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);
                 if (cnts > 0) {
                     score += (float) (weights_partial[tag.ordinal()] * optionalGroup.size * sizeScalingFactor * (1 + Math.log(2 + cnts)));
                 }
@@ -457,7 +459,7 @@ public class IndexResultScoreCalculator {
             case NAV -> 0.1f;
             case CODE -> 0.25f;
             case BODY -> 1.0f;
-            case EXTERNAL_LINKTEXT -> 0.75f;
+            case EXTERNAL_LINKTEXT -> 1.5f;
             default -> 0.0f;
         };
     }
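The two hunks above replace the flat +2 bonus with one that scales with the number of exactly matching external-link-text spans, and the weight table doubles EXTERNAL_LINKTEXT for partial matches from 0.75f to 1.5f. A rough sketch of the new bonus shape, not from the patch, with parameter names standing in for the fields used above:

/** Sketch only: parameters stand in for extLinkSpan.length(), fullGroup.size,
 *  and the count returned by containsRangeExact(...). */
class ExternalLinkTextBonusSketch {
    static float bonus(int extLinkSpanLength, int phraseLen, int exactMatches) {
        if (extLinkSpanLength >= phraseLen) {
            return 2.0f * exactMatches;   // previously: a flat +2, and only when the span length equalled the phrase length
        }
        return 0.0f;
    }
}

Because link texts are now written into the keyword positions with repetitions (see DocumentPositionMapper further down), a phrase that appears in many incoming anchors can match several spans and collect a proportionally larger bonus.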
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java
index 2e0b6bd7..2ee65d25 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java
@@ -1,6 +1,8 @@
 package nu.marginalia.atags;
 
 import com.google.inject.Inject;
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.atags.model.Link;
 import nu.marginalia.keyword.LinkTexts;
@@ -51,6 +53,7 @@ public class AnchorTextKeywords {
         List<Link> keywordsRaw = links.forUrl(url);
 
         List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
+        TIntList counts = new TIntArrayList(keywordsRaw.size());
 
         // Extract and count keywords from anchor text
         for (Link keyword : keywordsRaw) {
@@ -59,18 +62,20 @@ public class AnchorTextKeywords {
             var sentence = sentenceExtractor.extractSentence(keyword.text(),
                     EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
             ret.add(sentence);
+            counts.add(keyword.count());
         }
 
-        return new LinkTexts(ret);
+        return new LinkTexts(ret, counts);
     }
 
     public LinkTexts getAnchorTextKeywords(DomainLinks links, List urls) {
         List<Link> keywordsRaw = new ArrayList<>();
         for (var url : urls) {
-            links.forUrl(url);
+            keywordsRaw.addAll(links.forUrl(url));
         }
 
         List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
+        TIntList counts = new TIntArrayList(keywordsRaw.size());
 
         // Extract and count keywords from anchor text
         for (Link keyword : keywordsRaw) {
@@ -79,8 +84,9 @@ public class AnchorTextKeywords {
             var sentence = sentenceExtractor.extractSentence(keyword.text(),
                     EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
             ret.add(sentence);
+            counts.add(keyword.count());
         }
 
-        return new LinkTexts(ret);
+        return new LinkTexts(ret, counts);
     }
 }
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java
index 1c76469f..66d1e977 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java
@@ -1,4 +1,4 @@
 package nu.marginalia.atags.model;
 
-public record Link(String source, String text) {
+public record Link(String text, int count) {
 }
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java
index 784580fc..55986949 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java
@@ -1,7 +1,7 @@
 package nu.marginalia.atags.model;
 
-public record LinkWithText(String url, String text, String source) {
+public record LinkWithText(String url, String text, int cnt) {
     public Link toLink() {
-        return new Link(source, text);
+        return new Link(text, cnt);
     }
 }
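With Link now carrying a count and LinkTexts a parallel counts list, anchor texts and their observed frequencies travel together into keyword extraction. A usage sketch, not from the patch, mirroring AnchorTextKeywords above; the helper name is invented and the stop-word filtering of the real code is omitted:

import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.atags.model.Link;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;

import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;

class LinkTextsAssemblySketch {
    static LinkTexts assemble(SentenceExtractor sentenceExtractor, List<Link> anchors) {
        List<DocumentSentence> sentences = new ArrayList<>(anchors.size());
        TIntList counts = new TIntArrayList(anchors.size());

        for (Link anchor : anchors) {
            // Each anchor text is parsed as a sentence tagged EXTERNAL_LINKTEXT...
            sentences.add(sentenceExtractor.extractSentence(anchor.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT)));
            // ...and its observation count rides along in a parallel list
            counts.add(anchor.count());
        }

        return new LinkTexts(sentences, counts);
    }
}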
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
index c80a57c7..a15dfecd 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
@@ -80,7 +80,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
                 select
                     unnest(text) as 'text',
                     unnest(url) as 'url',
-                    unnest(source) as 'source'
+                    unnest(cnt) as 'cnt'
                 from atags
                 where dest = ?
                 """))
@@ -89,7 +89,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
             ps.setString(1, domain.toString());
             var rs = ps.executeQuery();
             while (rs.next()) {
-                links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source")));
+                links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getInt("cnt")));
             }
 
             // Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu?
@@ -102,7 +102,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
                 String url = rs.getString("url");
                 url = aliasDomain + url.substring(url.indexOf('/'));
 
-                links.add(new LinkWithText(url, rs.getString("text"), rs.getString("source")));
+                links.add(new LinkWithText(url, rs.getString("text"), rs.getInt("cnt")));
             }
             return new DomainLinks(links);
         }
diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
index d0db9b7c..c4050d0a 100644
--- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
+++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
@@ -5,35 +5,29 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.keyword.extractors.*;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.model.DocumentLanguageData;
-import nu.marginalia.language.model.DocumentSentence;
-import nu.marginalia.language.model.WordRep;
-import nu.marginalia.language.sentence.tag.HtmlTag;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 
-import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Comparator;
-import java.util.List;
 import java.util.stream.Stream;
 
 public class DocumentKeywordExtractor {
 
-    private final KeywordExtractor keywordExtractor;
     private final TermFrequencyDict dict;
+    private final KeywordExtractor keywordExtractor = new KeywordExtractor();
+    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
 
     @Inject
     public DocumentKeywordExtractor(TermFrequencyDict dict) {
         this.dict = dict;
-        this.keywordExtractor = new KeywordExtractor();
     }
 
     // for tests
     public DocumentKeywordExtractor() {
         try {
             this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels());
-            this.keywordExtractor = new KeywordExtractor();
         }
         catch (Exception ex) {
             throw new RuntimeException(ex);
@@ -60,7 +54,7 @@ public class DocumentKeywordExtractor {
 
         DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
 
-        createSimpleWords(wordsBuilder, keywordMetadata, dld, linkTexts);
+        positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
 
         createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
         createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
@@ -106,176 +100,4 @@ public class DocumentKeywordExtractor {
         }
     }
 
-    private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
-                                   KeywordMetadata metadata,
-                                   DocumentLanguageData dld,
-                                   LinkTexts linkTexts)
-    {
-        // we use 1-based indexing since the data
-        // will be gamma encoded, and it can't represent 0
-        int pos = 0;
-
-        List<SpanRecorder> spanRecorders = new ArrayList<>();
-        for (var htmlTag : HtmlTag.includedTags) {
-            if (!htmlTag.exclude) {
-                spanRecorders.add(new SpanRecorder(htmlTag));
-            }
-        }
-
-        for (DocumentSentence sent : dld) {
-            for (var word : sent) {
-                pos++;
-
-                for (var recorder : spanRecorders) {
-                    recorder.update(sent, pos);
-                }
-
-                if (word.isStopWord()) {
-                    continue;
-                }
-
-                String w = word.wordLowerCase();
-                if (matchesWordPattern(w)) {
-                    /* Add information about term positions */
-                    wordsBuilder.addPos(w, pos);
-
-                    /* Add metadata for word */
-                    wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
-                }
-            }
-
-            for (var names : keywordExtractor.getProperNames(sent)) {
-                var rep = new WordRep(sent, names);
-
-                byte meta = metadata.getMetadataForWord(rep.stemmed);
-
-                wordsBuilder.addMeta(rep.word, meta);
-            }
-        }
-
-        pos++; // we need to add one more position to account for the last word in the document
-
-        for (var recorder : spanRecorders) {
-            wordsBuilder.addSpans(recorder.finish(pos));
-
-            // reset the recorder, so we can use it again without adding the same positions twice
-            recorder.reset();
-        }
-
-        // Next add synthetic positions to the document for anchor texts
-
-        pos += 2; // add some padding to the end of the document before we start adding a-tag words
-
-        for (var linkText : linkTexts) {
-
-            for (var word : linkText) {
-                pos++;
-
-                for (var recorder : spanRecorders) {
-                    recorder.update(linkText, pos);
-                }
-
-                if (word.isStopWord()) {
-                    continue;
-                }
-
-                String w = word.wordLowerCase();
-                if (matchesWordPattern(w)) {
-                    /* Add information about term positions */
-                    wordsBuilder.addPos(w, pos);
-
-                    /* Add metadata for word */
-                    wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
-                }
-            }
-
-            // add some padding between separate link texts so we don't match across their boundaries
-            pos+=2;
-        }
-
-        for (var recorder : spanRecorders) {
-            wordsBuilder.addSpans(recorder.finish(pos));
-        }
-    }
-
-    boolean matchesWordPattern(String s) {
-        // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
-
-        String wordPartSeparator = ".-_/:+*";
-
-        int i = 0;
-
-        for (int run = 0; run < 15 && i < s.length(); run++, i++) {
-            char c = s.charAt(i);
-            if (c >= 'a' && c <= 'z') continue;
-            if (c >= 'A' && c <= 'Z') continue;
-            if (c >= '0' && c <= '9') continue;
-            break;
-        }
-
-        if (i == 0)
-            return false;
-
-        for (int j = 0; j < 5; j++) {
-            if (i == s.length()) return true;
-
-            if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
-                return false;
-            }
-
-            i++;
-
-            for (int run = 0; run < 10 && i < s.length(); run++, i++) {
-                char c = s.charAt(i);
-                if (c >= 'a' && c <= 'z') continue;
-                if (c >= 'A' && c <= 'Z') continue;
-                if (c >= '0' && c <= '9') continue;
-                break;
-            }
-        }
-
-        return false;
-    }
-
-    /** Helper class to record spans of words */
-    private static class SpanRecorder {
-        private List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
-        private final HtmlTag htmlTag;
-        private int start = 0;
-
-        public SpanRecorder(HtmlTag htmlTag) {
-            this.htmlTag = htmlTag;
-        }
-
-        public void update(DocumentSentence sentence, int pos) {
-            assert pos > 0;
-
-            if (
-                    sentence.htmlTags.contains(htmlTag)
-                            || (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY) // special case for body tag, we match against no tag on the sentence
-            )
-            {
-                if (start <= 0) start = pos;
-            }
-            else {
-                if (start > 0) {
-                    spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
-                    start = 0;
-                }
-            }
-        }
-
-        public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
-            if (start > 0) {
-                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
-                start = 0;
-            }
-            return spans;
-        }
-
-        public void reset() {
-            spans.clear();
-            start = 0;
-        }
-    }
 }
diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java
new file mode 100644
index 00000000..0644cf76
--- /dev/null
+++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java
@@ -0,0 +1,237 @@
+package nu.marginalia.keyword;
+
+import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.DocumentSentence;
+import nu.marginalia.language.model.WordRep;
+import nu.marginalia.language.sentence.tag.HtmlTag;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static java.lang.Math.min;
+import static java.lang.Math.sqrt;
+
+/** DocumentPositionMapper is responsible for assigning keywords positions in the document,
+ * as well as recording spans of positions
+ */
+public class DocumentPositionMapper {
+
+    private final KeywordExtractor keywordExtractor = new KeywordExtractor();
+
+    public void mapPositionsAndExtractSimpleKeywords(DocumentKeywordsBuilder wordsBuilder,
+                                                     KeywordMetadata metadata,
+                                                     DocumentLanguageData dld,
+                                                     LinkTexts linkTexts)
+    {
+
+        // First map the words in the document to their positions
+        int pos = mapDocumentPositions(wordsBuilder, metadata, dld);
+
+        // Next create some padding space to avoid cross-matching
+        pos += 2;
+
+        // Finally allocate some virtual space after the end of the document
+        // for the link texts, so that we can match against them as well, although
+        // these will be given a different span type.
+        mapLinkTextPositions(pos, wordsBuilder, metadata, linkTexts);
+    }
+
+
+    int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
+                             KeywordMetadata metadata,
+                             DocumentLanguageData dld)
+
+    {
+
+        List<SpanRecorder> spanRecorders = new ArrayList<>();
+        for (var htmlTag : HtmlTag.includedTags) {
+            if (!htmlTag.exclude) {
+                spanRecorders.add(new SpanRecorder(htmlTag));
+            }
+        }
+
+        // we use 1-based indexing since the data
+        // will be gamma encoded, and it can't represent 0;
+        // but the loop starts by incrementing the position,
+        // so while unintuitive, zero is correct here.
+        int pos = 0;
+
+        for (DocumentSentence sent : dld) {
+            for (var word : sent) {
+                pos++;
+
+                // Update span position tracking
+                for (var recorder : spanRecorders) {
+                    recorder.update(sent, pos);
+                }
+
+                if (word.isStopWord()) {
+                    continue;
+                }
+
+                String w = word.wordLowerCase();
+                if (matchesWordPattern(w)) {
+                    /* Add information about term positions */
+                    wordsBuilder.addPos(w, pos);
+
+                    /* Add metadata for word */
+                    wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
+                }
+            }
+
+            for (var names : keywordExtractor.getProperNames(sent)) {
+                WordRep rep = new WordRep(sent, names);
+                byte meta = metadata.getMetadataForWord(rep.stemmed);
+
+                wordsBuilder.addMeta(rep.word, meta);
+            }
+        }
+
+        pos++; // we need to add one more position to account for the last word in the document
+
+        for (var recorder : spanRecorders) {
+            wordsBuilder.addSpans(recorder.finish(pos));
+        }
+
+        return pos;
+    }
+
+    void mapLinkTextPositions(int startPos,
+                              DocumentKeywordsBuilder wordsBuilder,
+                              KeywordMetadata metadata,
+                              LinkTexts linkTexts)
+    {
+        int pos = startPos;
+
+        SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT);
+
+        LinkTexts.Iter iter = linkTexts.iterator();
+
+        while (iter.next()) {
+
+            DocumentSentence sentence = iter.sentence();
+            int count = iter.count();
+
+            // We repeat a link sentence a number of times that is a function of how many times it's been spotted
+            // as a link text.  A really "big" link typically has hundreds, if not thousands of repetitions, so we
+            // attenuate that a bit with math so we don't generate a needlessly large positions list
+
+            final int repetitions = (int) Math.max(1, min(sqrt(count), 12));
+
+            for (int ci = 0; ci < repetitions; ci++) {
+
+                for (var word : sentence) {
+                    pos++;
+
+                    extLinkRecorder.update(sentence, pos);
+
+                    if (word.isStopWord()) {
+                        continue;
+                    }
+
+                    String w = word.wordLowerCase();
+                    if (matchesWordPattern(w)) {
+                        /* Add information about term positions */
+                        wordsBuilder.addPos(w, pos);
+
+                        /* Add metadata for word */
+                        wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
+                    }
+                }
+
+                // Add a break between sentences, to prevent them being registered as one long run-on sentence
+                extLinkRecorder.endCurrentSpan(pos + 1);
+
+                // Also add some positional padding between separate link texts so we don't match across their boundaries
+                pos += 2;
+            }
+        }
+
+        wordsBuilder.addSpans(extLinkRecorder.finish(pos));
+    }
+
+    boolean matchesWordPattern(String s) {
+        // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
+
+        String wordPartSeparator = ".-_/:+*";
+
+        int i = 0;
+
+        for (int run = 0; run < 15 && i < s.length(); run++, i++) {
+            char c = s.charAt(i);
+            if (c >= 'a' && c <= 'z') continue;
+            if (c >= 'A' && c <= 'Z') continue;
+            if (c >= '0' && c <= '9') continue;
+            break;
+        }
+
+        if (i == 0)
+            return false;
+
+        for (int j = 0; j < 5; j++) {
+            if (i == s.length()) return true;
+
+            if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
+                return false;
+            }
+
+            i++;
+
+            for (int run = 0; run < 10 && i < s.length(); run++, i++) {
+                char c = s.charAt(i);
+                if (c >= 'a' && c <= 'z') continue;
+                if (c >= 'A' && c <= 'Z') continue;
+                if (c >= '0' && c <= '9') continue;
+                break;
+            }
+        }
+
+        return false;
+    }
+
+    /** Helper class to record spans of words */
+    private static class SpanRecorder {
+        private final List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
+        private final HtmlTag htmlTag;
+        private int start = 0;
+
+        public SpanRecorder(HtmlTag htmlTag) {
+            this.htmlTag = htmlTag;
+        }
+
+        public void update(DocumentSentence sentence, int pos) {
+            assert pos > 0;
+
+            if (sentence.htmlTags.contains(htmlTag)) {
+                if (start <= 0) start = pos;
+            }
+            else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY)
+            {
+                // special case for body tag, we match against no tag on the sentence
+                if (start <= 0) start = pos;
+            }
+            else {
+                if (start > 0) {
+                    spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
+                    start = 0;
+                }
+            }
+        }
+
+        public void endCurrentSpan(int pos) {
+            if (start > 0) {
+                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
+                start = 0;
+            }
+        }
+
+        public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
+            if (start > 0) {
+                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
+                start = 0;
+            }
+            return spans;
+        }
+    }
+}
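The repetition formula above caps how much a very popular anchor phrase can inflate the virtual position space. A small illustration, not part of the patch, of how observation counts map to repetitions:

/** Sketch only: repetitions = clamp(sqrt(count), 1, 12), as in mapLinkTextPositions above. */
class LinkTextRepetitionSketch {
    static int repetitionsFor(int count) {
        return (int) Math.max(1, Math.min(Math.sqrt(count), 12));
    }

    public static void main(String[] args) {
        for (int count : new int[] { 1, 2, 4, 9, 100, 5000, 1_000_000 }) {
            // prints 1, 1, 2, 3, 10, 12, 12
            System.out.println(count + " -> " + repetitionsFor(count));
        }
    }
}

So an anchor text seen four times is written out twice, and even one seen a million times is written at most twelve times, keeping the positions lists small.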
diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java
index 021bbbb0..1b1e5571 100644
--- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java
+++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java
@@ -6,7 +6,7 @@ import nu.marginalia.keyword.extractors.TitleKeywords;
 import nu.marginalia.keyword.extractors.UrlKeywords;
 import nu.marginalia.model.idx.WordFlags;
 
-class KeywordMetadata {
+public class KeywordMetadata {
 
     private final TitleKeywords titleKeywords;
     private final NameLikeKeywords nameLikeKeywords;
diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java
index c1ade6b4..f2501930 100644
--- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java
+++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java
@@ -1,19 +1,40 @@
 package nu.marginalia.keyword;
 
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
 import nu.marginalia.language.model.DocumentSentence;
 import org.jetbrains.annotations.NotNull;
 
-import java.util.Iterator;
 import java.util.List;
 
-public record LinkTexts(List<DocumentSentence> linkTexts) implements Iterable<DocumentSentence> {
+public record LinkTexts(
+        List<DocumentSentence> linkTexts,
+        TIntList counts
+) {
     public LinkTexts() {
-        this(List.of());
+        this(List.of(), new TIntArrayList());
+    }
+
+    public int length() {
+        return linkTexts.size();
     }
 
     @NotNull
-    @Override
-    public Iterator<DocumentSentence> iterator() {
-        return linkTexts.iterator();
+    public LinkTexts.Iter iterator() {
+        return new Iter();
+    }
+
+    public class Iter {
+        private int pos = -1;
+
+        public boolean next() {
+            return ++pos < length();
+        }
+        public int count() {
+            return counts.get(pos);
+        }
+        public DocumentSentence sentence() {
+            return linkTexts.get(pos);
+        }
    }
 }
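LinkTexts no longer implements Iterable; sentences and their counts are read in lockstep through the cursor-style Iter. A minimal usage sketch, not from the patch, with an invented class and method name:

import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.language.model.DocumentSentence;

class LinkTextsIterSketch {
    static void walk(LinkTexts linkTexts) {
        LinkTexts.Iter iter = linkTexts.iterator();
        while (iter.next()) {
            DocumentSentence sentence = iter.sentence();
            int count = iter.count();   // how many times this anchor text was observed
            // hand (sentence, count) to position mapping, as DocumentPositionMapper does
            System.out.println(count + "x " + sentence);
        }
    }
}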
diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java
index 74a424ef..6d2a4df5 100644
--- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java
+++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java
@@ -17,7 +17,7 @@ import java.util.*;
 public class DocumentKeywordsBuilder {
     public final Object2ByteOpenHashMap<String> wordToMeta;
     public final HashMap<String, IntList> wordToPos;
-    public final Map<Character, List<DocumentWordSpan>> wordSpans = new HashMap<>();
+    public final Map<HtmlTag, List<DocumentWordSpan>> wordSpans = new HashMap<>();
 
     /**
      * These ware keywords that had signals of high relevance
@@ -70,7 +70,7 @@ public class DocumentKeywordsBuilder {
                 positionsForTag.add(span.end());
             }
 
-            spans.add(new CodedWordSpan((byte) tag.charValue(), VarintCodedSequence.generate(positionsForTag)));
+            spans.add(new CodedWordSpan(tag.code, VarintCodedSequence.generate(positionsForTag)));
         });
 
         return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);
@@ -128,7 +128,7 @@ public class DocumentKeywordsBuilder {
 
     public void addSpans(List<DocumentWordSpan> newSpans) {
         for (var span : newSpans) {
-            wordSpans.computeIfAbsent((char) span.tag().code, k -> new ArrayList<>()).add(span);
+            wordSpans.computeIfAbsent(span.tag(), k -> new ArrayList<>()).add(span);
         }
     }
 
diff --git a/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java
index 83996e41..5f25f8ed 100644
--- a/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java
+++ b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java
@@ -25,21 +25,6 @@ class DocumentKeywordExtractorTest {
     static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor();
     static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
 
-    @Test
-    public void testWordPattern() {
-        Assertions.assertTrue(extractor.matchesWordPattern("test"));
-        Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
-        Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));
-
-        Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test"));
-        Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test"));
-        Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24"));
-        Assertions.assertTrue(extractor.matchesWordPattern("std::vector"));
-        Assertions.assertTrue(extractor.matchesWordPattern("c++"));
-        Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h"));
-        Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
-    }
-
     @Test
     public void testKeyboards2() throws IOException, URISyntaxException {
         var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
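With wordSpans keyed by HtmlTag rather than the tag's char code (see the DocumentKeywordsBuilder hunk above), callers can look spans up by the enum directly, as the new tests below do. A small sketch, not from the patch, with an invented helper name:

import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.sentence.tag.HtmlTag;

import java.util.List;

class WordSpanLookupSketch {
    static void printLinkTextSpans(DocumentKeywordsBuilder builder) {
        // No more casting through char codes: the enum is the map key
        var spans = builder.wordSpans.getOrDefault(HtmlTag.EXTERNAL_LINKTEXT, List.of());
        for (var span : spans) {
            System.out.println(span.start() + ".." + span.end());
        }
    }
}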
diff --git a/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java
new file mode 100644
index 00000000..a00dd3ae
--- /dev/null
+++ b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java
@@ -0,0 +1,184 @@
+package nu.marginalia.keyword;
+
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
+import it.unimi.dsi.fastutil.ints.IntList;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.DocumentSentence;
+import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.language.sentence.tag.HtmlTag;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class DocumentPositionMapperTest {
+    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
+    static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
+
+    @Test
+    public void testWordPattern() {
+        Assertions.assertTrue(positionMapper.matchesWordPattern("test"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("1234567890abcde"));
+        Assertions.assertFalse(positionMapper.matchesWordPattern("1234567890abcdef"));
+
+        Assertions.assertTrue(positionMapper.matchesWordPattern("test-test-test-test-test"));
+        Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("192.168.1.100/24"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("c++"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("m*a*s*h"));
+        Assertions.assertFalse(positionMapper.matchesWordPattern("Stulpnagelstrasse"));
+    }
+
+    @Test
+    public void testBasic() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+        DocumentLanguageData dld = new DocumentLanguageData(
+                se.extractSentencesFromString("I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
+                "I am a teapot"
+        );
+
+        int pos = positionMapper.mapDocumentPositions(keywordsBuilder, Mockito.mock(KeywordMetadata.class), dld);
+
+        assertEquals(8, pos);
+        assertEquals(IntList.of(1), keywordsBuilder.wordToPos.get("i"));
+        assertEquals(IntList.of(2), keywordsBuilder.wordToPos.get("am"));
+        assertEquals(IntList.of(3), keywordsBuilder.wordToPos.get("a"));
+        assertEquals(IntList.of(4), keywordsBuilder.wordToPos.get("teapot"));
+        assertEquals(IntList.of(5), keywordsBuilder.wordToPos.get("short"));
+        assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("and"));
+        assertEquals(IntList.of(7), keywordsBuilder.wordToPos.get("stout"));
+
+        var codeSpans = keywordsBuilder.wordSpans.get(HtmlTag.CODE);
+        assertEquals(1, codeSpans.size());
+        var codeSpan = codeSpans.getFirst();
+
+        assertEquals(1, codeSpan.start());
+        assertEquals(8, codeSpan.end());
+    }
+
+
+    @Test
+    public void testLinksSingleWord1Rep() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences.size());
+        TIntList counts = new TIntArrayList(new int[] { 1 });
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentences, counts));
+
+        assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("zelda"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(1, linkTextSpans.size());
+        var codeSpan = linkTextSpans.getFirst();
+
+        assertEquals(6, codeSpan.start());
+        assertEquals(7, codeSpan.end());
+    }
+
+    @Test
+    public void testLinksSingleWord2Reps() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences.size());
+        TIntList counts = new TIntArrayList(new int[] { 4 }); // This will become 2 repetitions, formula is ~ sqrt(counts)
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentences, counts));
+
+        assertEquals(IntList.of(6, 9), keywordsBuilder.wordToPos.get("zelda"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(2, linkTextSpans.size());
+
+        DocumentKeywordsBuilder.DocumentWordSpan span;
+        span = linkTextSpans.get(0);
+
+        assertEquals(6, span.start());
+        assertEquals(7, span.end());
+
+        span = linkTextSpans.get(1);
+
+        assertEquals(9, span.start());
+        assertEquals(10, span.end());
+    }
+
+    @Test
+    public void testLinksTwoWords2Reps() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences = se.extractSentencesFromString("Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences.size());
+        TIntList counts = new TIntArrayList(new int[] { 4 });
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentences, counts));
+
+        assertEquals(IntList.of(6, 10), keywordsBuilder.wordToPos.get("zelda"));
+        assertEquals(IntList.of(7, 11), keywordsBuilder.wordToPos.get("ii"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(2, linkTextSpans.size());
+
+        DocumentKeywordsBuilder.DocumentWordSpan span;
+        span = linkTextSpans.get(0);
+
+        assertEquals(6, span.start());
+        assertEquals(8, span.end());
+
+        span = linkTextSpans.get(1);
+
+        assertEquals(10, span.start());
+        assertEquals(12, span.end());
+    }
+
+
+    @Test
+    public void testLinksTwoSent1Word1Rep() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences1 = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        var sentences2 = se.extractSentencesFromString("Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences1.size());
+        assertEquals(1, sentences2.size());
+        TIntList counts = new TIntArrayList(new int[] { 1, 1 });
+
+        List<DocumentSentence> sentencesAll = new ArrayList<>();
+        sentencesAll.addAll(sentences1);
+        sentencesAll.addAll(sentences2);
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentencesAll, counts));
+
+        assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("zelda"));
+        assertEquals(IntList.of(9), keywordsBuilder.wordToPos.get("link"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(2, linkTextSpans.size());
+
+        DocumentKeywordsBuilder.DocumentWordSpan span;
+        span = linkTextSpans.get(0);
+
+        assertEquals(6, span.start());
+        assertEquals(7, span.end());
+
+        span = linkTextSpans.get(1);
+
+        assertEquals(9, span.start());
+        assertEquals(10, span.end());
+    }
+
+
+}
\ No newline at end of file
diff --git a/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java
index 537d6869..e7ceb519 100644
--- a/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java
+++ b/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java
@@ -75,7 +75,6 @@ public class DocumentLoaderService {
 
         public void accept(SlopDocumentRecord.MetadataProjection projection)
         {
-
             long urlId = UrlIdCodec.encodeId(
                     domainIdRegistry.getDomainId(projection.domain()),
                     projection.ordinal()
@@ -88,7 +87,7 @@ public class DocumentLoaderService {
             }
 
             try {
-                documentDbWriter.add(new DocdbUrlDetail(
+                details.add(new DocdbUrlDetail(
                         urlId,
                         parsedUrl.get(),
                        projection.title(),