Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-24 05:18:58 +00:00

(keyword-extraction) Clean up code and add tests for position and spans calculation

This code has been a bit of a mess and historically quite flaky, so some test coverage is more than overdue.

Parent: 20abb91657
Commit: e0c0ed27bc
DocumentKeywordExtractor.java:

@@ -1,43 +1,33 @@
 package nu.marginalia.keyword;

 import com.google.inject.Inject;
-import gnu.trove.list.TIntList;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.keyword.extractors.*;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.model.DocumentLanguageData;
-import nu.marginalia.language.model.DocumentSentence;
-import nu.marginalia.language.model.WordRep;
-import nu.marginalia.language.sentence.tag.HtmlTag;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;

-import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Comparator;
-import java.util.List;
 import java.util.stream.Stream;

-import static java.lang.Math.min;
-import static java.lang.Math.sqrt;

 public class DocumentKeywordExtractor {

-    private final KeywordExtractor keywordExtractor;
     private final TermFrequencyDict dict;

+    private final KeywordExtractor keywordExtractor = new KeywordExtractor();
+    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();

     @Inject
     public DocumentKeywordExtractor(TermFrequencyDict dict) {
         this.dict = dict;
-        this.keywordExtractor = new KeywordExtractor();
     }

     // for tests
     public DocumentKeywordExtractor() {
         try {
             this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels());
-            this.keywordExtractor = new KeywordExtractor();
         }
         catch (Exception ex) {
             throw new RuntimeException(ex);

@@ -64,7 +54,7 @@ public class DocumentKeywordExtractor {

         DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();

-        createSimpleWords(wordsBuilder, keywordMetadata, dld, linkTexts);
+        positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);

         createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
         createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);

@@ -110,202 +100,4 @@ public class DocumentKeywordExtractor {
         }
     }

-    private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
-                                   KeywordMetadata metadata,
-                                   DocumentLanguageData dld,
-                                   LinkTexts linkTexts)
-    {
-        // we use 1-based indexing since the data
-        // will be gamma encoded, and it can't represent 0
-        int pos = 0;
-
-        List<SpanRecorder> spanRecorders = new ArrayList<>();
-        for (var htmlTag : HtmlTag.includedTags) {
-            if (!htmlTag.exclude) {
-                spanRecorders.add(new SpanRecorder(htmlTag));
-            }
-        }
-
-        for (DocumentSentence sent : dld) {
-            for (var word : sent) {
-                pos++;
-
-                for (var recorder : spanRecorders) {
-                    recorder.update(sent, pos);
-                }
-
-                if (word.isStopWord()) {
-                    continue;
-                }
-
-                String w = word.wordLowerCase();
-                if (matchesWordPattern(w)) {
-                    /* Add information about term positions */
-                    wordsBuilder.addPos(w, pos);
-
-                    /* Add metadata for word */
-                    wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
-                }
-            }
-
-            for (var names : keywordExtractor.getProperNames(sent)) {
-                var rep = new WordRep(sent, names);
-
-                byte meta = metadata.getMetadataForWord(rep.stemmed);
-
-                wordsBuilder.addMeta(rep.word, meta);
-            }
-        }
-
-        pos++; // we need to add one more position to account for the last word in the document
-
-        for (var recorder : spanRecorders) {
-            wordsBuilder.addSpans(recorder.finish(pos));
-
-            // reset the recorder, so we can use it again without adding the same positions twice
-            recorder.reset();
-        }
-
-        // ---
-
-        // Next add synthetic positions to the document for anchor texts
-
-        pos += 2; // add some padding to the end of the document before we start adding a-tag words
-
-        // Add
-
-        List<DocumentSentence> sentences = linkTexts.linkTexts();
-        TIntList counts = linkTexts.counts();
-        SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT);
-
-        for (int i = 0; i < linkTexts.length(); i++) {
-
-            DocumentSentence sentence = sentences.get(i);
-
-            // We repeat a link sentence a number of times that is a function of how many times it's been spotted
-            // as a link text.  A really "big" link typically has hundreds, if not thousands of repetitions, so we
-            // attenuate that a bit with math so we don't generate a needlessly large positions list
-            final int repetitions = (int) min(sqrt(counts.get(i)), 12);
-
-            for (int ci = 0; ci < repetitions; ci++) {
-                for (var word : sentence) {
-                    pos++;
-
-                    extLinkRecorder.update(sentence, pos);
-
-                    if (word.isStopWord()) {
-                        continue;
-                    }
-
-                    String w = word.wordLowerCase();
-                    if (matchesWordPattern(w)) {
-                        /* Add information about term positions */
-                        wordsBuilder.addPos(w, pos);
-
-                        /* Add metadata for word */
-                        wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
-                    }
-                }
-
-                // Add a break between sentences, to prevent them being registered as one long run-on sentence
-                extLinkRecorder.stop(pos + 1);
-
-                // Also add some positional padding between separate link texts so we don't match across their boundaries
-                pos += 2;
-            }
-        }
-
-        wordsBuilder.addSpans(extLinkRecorder.finish(pos));
-    }
-
-    boolean matchesWordPattern(String s) {
-        // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
-
-        String wordPartSeparator = ".-_/:+*";
-
-        int i = 0;
-
-        for (int run = 0; run < 15 && i < s.length(); run++, i++) {
-            char c = s.charAt(i);
-            if (c >= 'a' && c <= 'z') continue;
-            if (c >= 'A' && c <= 'Z') continue;
-            if (c >= '0' && c <= '9') continue;
-            break;
-        }
-
-        if (i == 0)
-            return false;
-
-        for (int j = 0; j < 5; j++) {
-            if (i == s.length()) return true;
-
-            if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
-                return false;
-            }
-
-            i++;
-
-            for (int run = 0; run < 10 && i < s.length(); run++, i++) {
-                char c = s.charAt(i);
-                if (c >= 'a' && c <= 'z') continue;
-                if (c >= 'A' && c <= 'Z') continue;
-                if (c >= '0' && c <= '9') continue;
-                break;
-            }
-        }
-
-        return false;
-    }
-
-    /** Helper class to record spans of words */
-    private static class SpanRecorder {
-        private List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
-        private final HtmlTag htmlTag;
-        private int start = 0;
-
-        public SpanRecorder(HtmlTag htmlTag) {
-            this.htmlTag = htmlTag;
-        }
-
-        public void update(DocumentSentence sentence, int pos) {
-            assert pos > 0;
-
-            if (
-                    sentence.htmlTags.contains(htmlTag)
-                 || (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY) // special case for body tag, we match against no tag on the sentence
-               )
-            {
-                if (start <= 0) start = pos;
-            }
-            else {
-                if (start > 0) {
-                    spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
-                    start = 0;
-                }
-            }
-        }
-
-        public void stop(int pos) {
-            if (start > 0) {
-                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
-                start = 0;
-            }
-        }
-
-        public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
-            if (start > 0) {
-                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
-                start = 0;
-            }
-            return spans;
-        }
-
-        public void reset() {
-            spans.clear();
-            start = 0;
-        }
-    }
 }
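Note: the "1-based indexing" comment above exists because Elias gamma coding has codewords only for integers >= 1; a word position of zero would be unrepresentable. A minimal illustration of why (a standalone sketch, not code from this commit; the GammaDemo class is hypothetical):

class GammaDemo {
    // Elias gamma: (bit-length - 1) zeros, then n in binary; defined only for n >= 1.
    static String eliasGamma(int n) {
        if (n < 1) throw new IllegalArgumentException("no gamma codeword for " + n);
        String bits = Integer.toBinaryString(n);       // 5 -> "101"
        return "0".repeat(bits.length() - 1) + bits;   // 5 -> "00101"
    }

    public static void main(String[] args) {
        System.out.println(eliasGamma(1)); // "1"
        System.out.println(eliasGamma(5)); // "00101"
    }
}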
DocumentPositionMapper.java (new file):

@@ -0,0 +1,237 @@
+package nu.marginalia.keyword;
+
+import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.DocumentSentence;
+import nu.marginalia.language.model.WordRep;
+import nu.marginalia.language.sentence.tag.HtmlTag;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static java.lang.Math.min;
+import static java.lang.Math.sqrt;
+
+/** DocumentPositionMapper is responsible for assigning keywords positions in the document,
+ *  as well as recording spans of positions
+ */
+public class DocumentPositionMapper {
+
+    private final KeywordExtractor keywordExtractor = new KeywordExtractor();
+
+    public void mapPositionsAndExtractSimpleKeywords(DocumentKeywordsBuilder wordsBuilder,
+                                                     KeywordMetadata metadata,
+                                                     DocumentLanguageData dld,
+                                                     LinkTexts linkTexts)
+    {
+        // First map the words in the document to their positions
+        int pos = mapDocumentPositions(wordsBuilder, metadata, dld);
+
+        // Next create some padding space to avoid cross-matching
+        pos += 2;
+
+        // Finally allocate some virtual space after the end of the document
+        // for the link texts, so that we can match against them as well, although
+        // these will be given a different span type.
+        mapLinkTextPositions(pos, wordsBuilder, metadata, linkTexts);
+    }
+
+    int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
+                             KeywordMetadata metadata,
+                             DocumentLanguageData dld)
+    {
+        List<SpanRecorder> spanRecorders = new ArrayList<>();
+        for (var htmlTag : HtmlTag.includedTags) {
+            if (!htmlTag.exclude) {
+                spanRecorders.add(new SpanRecorder(htmlTag));
+            }
+        }
+
+        // we use 1-based indexing since the data
+        // will be gamma encoded, and it can't represent 0;
+        // but the loop starts by incrementing the position,
+        // so while unintuitive, zero is correct here.
+        int pos = 0;
+
+        for (DocumentSentence sent : dld) {
+            for (var word : sent) {
+                pos++;
+
+                // Update span position tracking
+                for (var recorder : spanRecorders) {
+                    recorder.update(sent, pos);
+                }
+
+                if (word.isStopWord()) {
+                    continue;
+                }
+
+                String w = word.wordLowerCase();
+                if (matchesWordPattern(w)) {
+                    /* Add information about term positions */
+                    wordsBuilder.addPos(w, pos);
+
+                    /* Add metadata for word */
+                    wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
+                }
+            }
+
+            for (var names : keywordExtractor.getProperNames(sent)) {
+                WordRep rep = new WordRep(sent, names);
+                byte meta = metadata.getMetadataForWord(rep.stemmed);
+
+                wordsBuilder.addMeta(rep.word, meta);
+            }
+        }
+
+        pos++; // we need to add one more position to account for the last word in the document
+
+        for (var recorder : spanRecorders) {
+            wordsBuilder.addSpans(recorder.finish(pos));
+        }
+
+        return pos;
+    }
+
+    void mapLinkTextPositions(int startPos,
+                              DocumentKeywordsBuilder wordsBuilder,
+                              KeywordMetadata metadata,
+                              LinkTexts linkTexts)
+    {
+        int pos = startPos;
+
+        SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT);
+
+        LinkTexts.Iter iter = linkTexts.iterator();
+
+        while (iter.next()) {
+
+            DocumentSentence sentence = iter.sentence();
+            int count = iter.count();
+
+            // We repeat a link sentence a number of times that is a function of how many times it's been spotted
+            // as a link text.  A really "big" link typically has hundreds, if not thousands of repetitions, so we
+            // attenuate that a bit with math so we don't generate a needlessly large positions list
+            final int repetitions = (int) Math.max(1, min(sqrt(count), 12));
+
+            for (int ci = 0; ci < repetitions; ci++) {
+
+                for (var word : sentence) {
+                    pos++;
+
+                    extLinkRecorder.update(sentence, pos);
+
+                    if (word.isStopWord()) {
+                        continue;
+                    }
+
+                    String w = word.wordLowerCase();
+                    if (matchesWordPattern(w)) {
+                        /* Add information about term positions */
+                        wordsBuilder.addPos(w, pos);
+
+                        /* Add metadata for word */
+                        wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
+                    }
+                }
+
+                // Add a break between sentences, to prevent them being registered as one long run-on sentence
+                extLinkRecorder.endCurrentSpan(pos + 1);
+
+                // Also add some positional padding between separate link texts so we don't match across their boundaries
+                pos += 2;
+            }
+        }
+
+        wordsBuilder.addSpans(extLinkRecorder.finish(pos));
+    }
+
+    boolean matchesWordPattern(String s) {
+        // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
+
+        String wordPartSeparator = ".-_/:+*";
+
+        int i = 0;
+
+        for (int run = 0; run < 15 && i < s.length(); run++, i++) {
+            char c = s.charAt(i);
+            if (c >= 'a' && c <= 'z') continue;
+            if (c >= 'A' && c <= 'Z') continue;
+            if (c >= '0' && c <= '9') continue;
+            break;
+        }
+
+        if (i == 0)
+            return false;
+
+        for (int j = 0; j < 5; j++) {
+            if (i == s.length()) return true;
+
+            if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
+                return false;
+            }
+
+            i++;
+
+            for (int run = 0; run < 10 && i < s.length(); run++, i++) {
+                char c = s.charAt(i);
+                if (c >= 'a' && c <= 'z') continue;
+                if (c >= 'A' && c <= 'Z') continue;
+                if (c >= '0' && c <= '9') continue;
+                break;
+            }
+        }
+
+        return false;
+    }
+
+    /** Helper class to record spans of words */
+    private static class SpanRecorder {
+        private final List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
+        private final HtmlTag htmlTag;
+        private int start = 0;
+
+        public SpanRecorder(HtmlTag htmlTag) {
+            this.htmlTag = htmlTag;
+        }
+
+        public void update(DocumentSentence sentence, int pos) {
+            assert pos > 0;
+
+            if (sentence.htmlTags.contains(htmlTag)) {
+                if (start <= 0) start = pos;
+            }
+            else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY)
+            {
+                // special case for body tag, we match against no tag on the sentence
+                if (start <= 0) start = pos;
+            }
+            else {
+                if (start > 0) {
+                    spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
+                    start = 0;
+                }
+            }
+        }
+
+        public void endCurrentSpan(int pos) {
+            if (start > 0) {
+                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
+                start = 0;
+            }
+        }
+
+        public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
+            if (start > 0) {
+                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
+                start = 0;
+            }
+            return spans;
+        }
+    }
+}
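Note: matchesWordPattern documents itself as an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}. A regex-based oracle for cross-checking it in tests could look like the sketch below (hypothetical, not part of this commit). Be aware that the unrolled code is slightly more permissive than the quoted pattern: it accepts inputs like "c++" and "std::vector" (as the tests assert), where the regex as written demands at least one alphanumeric character after every separator, so the two would not agree on every input.

import java.util.regex.Pattern;

// Hypothetical reference oracle built from the regex quoted in the code comment.
class WordPatternReference {
    private static final Pattern WORD =
            Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:+*][\\da-zA-Z]{1,10}){0,4}");

    static boolean matches(String s) {
        return WORD.matcher(s).matches();
    }
}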
KeywordMetadata.java:

@@ -6,7 +6,7 @@ import nu.marginalia.keyword.extractors.TitleKeywords;
 import nu.marginalia.keyword.extractors.UrlKeywords;
 import nu.marginalia.model.idx.WordFlags;

-class KeywordMetadata {
+public class KeywordMetadata {

     private final TitleKeywords titleKeywords;
     private final NameLikeKeywords nameLikeKeywords;
LinkTexts.java:

@@ -5,13 +5,12 @@ import gnu.trove.list.array.TIntArrayList;
 import nu.marginalia.language.model.DocumentSentence;
 import org.jetbrains.annotations.NotNull;

-import java.util.Iterator;
 import java.util.List;

 public record LinkTexts(
         List<DocumentSentence> linkTexts,
         TIntList counts
-) implements Iterable<DocumentSentence> {
+) {
     public LinkTexts() {
         this(List.of(), new TIntArrayList());
     }

@@ -21,8 +20,21 @@ public record LinkTexts(
     }

     @NotNull
-    @Override
-    public Iterator<DocumentSentence> iterator() {
-        return linkTexts.iterator();
+    public LinkTexts.Iter iterator() {
+        return new Iter();
+    }
+
+    public class Iter {
+        private int pos = -1;
+
+        public boolean next() {
+            return ++pos < length();
+        }
+        public int count() {
+            return counts.get(pos);
+        }
+        public DocumentSentence sentence() {
+            return linkTexts.get(pos);
+        }
     }
 }
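Note: LinkTexts no longer implements Iterable<DocumentSentence>. A standard Iterator yields a single value per step, so pairing each sentence with its repetition count would have forced a wrapper allocation per element; the cursor-style Iter exposes both through one next()/sentence()/count() protocol instead. Since pos starts at -1, next() must be called before the first access. A usage sketch (texts is a hypothetical, already-populated LinkTexts instance):

LinkTexts.Iter iter = texts.iterator();
while (iter.next()) {                         // advances the cursor; false when exhausted
    DocumentSentence sentence = iter.sentence();
    int count = iter.count();                 // times this link text was observed
    // process sentence weighted by count ...
}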
DocumentKeywordsBuilder.java:

@@ -17,7 +17,7 @@ import java.util.*;
 public class DocumentKeywordsBuilder {
     public final Object2ByteOpenHashMap<String> wordToMeta;
     public final HashMap<String, IntList> wordToPos;
-    public final Map<Character, List<DocumentWordSpan>> wordSpans = new HashMap<>();
+    public final Map<HtmlTag, List<DocumentWordSpan>> wordSpans = new HashMap<>();

     /**
      * These ware keywords that had signals of high relevance

@@ -70,7 +70,7 @@ public class DocumentKeywordsBuilder {
             positionsForTag.add(span.end());
         }

-        spans.add(new CodedWordSpan((byte) tag.charValue(), VarintCodedSequence.generate(positionsForTag)));
+        spans.add(new CodedWordSpan(tag.code, VarintCodedSequence.generate(positionsForTag)));
     });

     return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);

@@ -128,7 +128,7 @@ public class DocumentKeywordsBuilder {

     public void addSpans(List<DocumentWordSpan> newSpans) {
         for (var span : newSpans) {
-            wordSpans.computeIfAbsent((char) span.tag().code, k -> new ArrayList<>()).add(span);
+            wordSpans.computeIfAbsent(span.tag(), k -> new ArrayList<>()).add(span);
         }
     }
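Note: wordSpans is now keyed by HtmlTag directly, dropping the lossy char round-trip of the tag's byte code; the code is applied only at serialization time (tag.code in the CodedWordSpan above). The surrounding build step flattens each tag's spans into a single position list before varint coding, roughly like the sketch below (the list type and the start-before-end ordering are assumptions here; only the span.end() call and the CodedWordSpan call appear in the diff context):

import java.util.ArrayList;
import java.util.List;

// Hypothetical illustration of per-tag span flattening prior to
// VarintCodedSequence.generate(positionsForTag).
class SpanFlatteningSketch {
    record Span(int start, int end) {}

    static List<Integer> flatten(List<Span> spansForTag) {
        List<Integer> positionsForTag = new ArrayList<>();
        for (Span span : spansForTag) {
            positionsForTag.add(span.start()); // assumed to precede the end offset
            positionsForTag.add(span.end());   // shown in the diff context
        }
        return positionsForTag;
    }
}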
DocumentKeywordExtractorTest.java:

@@ -25,21 +25,6 @@ class DocumentKeywordExtractorTest {
     static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor();
     static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

-    @Test
-    public void testWordPattern() {
-        Assertions.assertTrue(extractor.matchesWordPattern("test"));
-        Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
-        Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));
-
-        Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test"));
-        Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test"));
-        Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24"));
-        Assertions.assertTrue(extractor.matchesWordPattern("std::vector"));
-        Assertions.assertTrue(extractor.matchesWordPattern("c++"));
-        Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h"));
-        Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
-    }
-
     @Test
     public void testKeyboards2() throws IOException, URISyntaxException {
         var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
DocumentPositionMapperTest.java (new file):

@@ -0,0 +1,184 @@
+package nu.marginalia.keyword;
+
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
+import it.unimi.dsi.fastutil.ints.IntList;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.DocumentSentence;
+import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.language.sentence.tag.HtmlTag;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class DocumentPositionMapperTest {
+    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
+    static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
+
+    @Test
+    public void testWordPattern() {
+        Assertions.assertTrue(positionMapper.matchesWordPattern("test"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("1234567890abcde"));
+        Assertions.assertFalse(positionMapper.matchesWordPattern("1234567890abcdef"));
+
+        Assertions.assertTrue(positionMapper.matchesWordPattern("test-test-test-test-test"));
+        Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("192.168.1.100/24"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("c++"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("m*a*s*h"));
+        Assertions.assertFalse(positionMapper.matchesWordPattern("Stulpnagelstrasse"));
+    }
+
+    @Test
+    public void testBasic() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+        DocumentLanguageData dld = new DocumentLanguageData(
+                se.extractSentencesFromString("I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
+                "I am a teapot"
+        );
+
+        int pos = positionMapper.mapDocumentPositions(keywordsBuilder, Mockito.mock(KeywordMetadata.class), dld);
+
+        assertEquals(8, pos);
+        assertEquals(IntList.of(1), keywordsBuilder.wordToPos.get("i"));
+        assertEquals(IntList.of(2), keywordsBuilder.wordToPos.get("am"));
+        assertEquals(IntList.of(3), keywordsBuilder.wordToPos.get("a"));
+        assertEquals(IntList.of(4), keywordsBuilder.wordToPos.get("teapot"));
+        assertEquals(IntList.of(5), keywordsBuilder.wordToPos.get("short"));
+        assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("and"));
+        assertEquals(IntList.of(7), keywordsBuilder.wordToPos.get("stout"));
+
+        var codeSpans = keywordsBuilder.wordSpans.get(HtmlTag.CODE);
+        assertEquals(1, codeSpans.size());
+        var codeSpan = codeSpans.getFirst();
+
+        assertEquals(1, codeSpan.start());
+        assertEquals(8, codeSpan.end());
+    }
+
+
+    @Test
+    public void testLinksSingleWord1Rep() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences.size());
+        TIntList counts = new TIntArrayList(new int[] { 1 });
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentences, counts));
+
+        assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("zelda"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(1, linkTextSpans.size());
+        var codeSpan = linkTextSpans.getFirst();
+
+        assertEquals(6, codeSpan.start());
+        assertEquals(7, codeSpan.end());
+    }
+
+    @Test
+    public void testLinksSingleWord2Reps() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences.size());
+        TIntList counts = new TIntArrayList(new int[] { 4 }); // This will become 2 repetitions, formula is ~ sqrt(counts)
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentences, counts));
+
+        assertEquals(IntList.of(6, 9), keywordsBuilder.wordToPos.get("zelda"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(2, linkTextSpans.size());
+
+        DocumentKeywordsBuilder.DocumentWordSpan span;
+        span = linkTextSpans.get(0);
+
+        assertEquals(6, span.start());
+        assertEquals(7, span.end());
+
+        span = linkTextSpans.get(1);
+
+        assertEquals(9, span.start());
+        assertEquals(10, span.end());
+    }
+
+    @Test
+    public void testLinksTwoWords2Reps() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences = se.extractSentencesFromString("Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences.size());
+        TIntList counts = new TIntArrayList(new int[] { 4 });
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentences, counts));
+
+        assertEquals(IntList.of(6, 10), keywordsBuilder.wordToPos.get("zelda"));
+        assertEquals(IntList.of(7, 11), keywordsBuilder.wordToPos.get("ii"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(2, linkTextSpans.size());
+
+        DocumentKeywordsBuilder.DocumentWordSpan span;
+        span = linkTextSpans.get(0);
+
+        assertEquals(6, span.start());
+        assertEquals(8, span.end());
+
+        span = linkTextSpans.get(1);
+
+        assertEquals(10, span.start());
+        assertEquals(12, span.end());
+    }
+
+
+    @Test
+    public void testLinksTwoSent1Word1Rep() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences1 = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        var sentences2 = se.extractSentencesFromString("Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences1.size());
+        assertEquals(1, sentences2.size());
+        TIntList counts = new TIntArrayList(new int[] { 1, 1 });
+
+        List<DocumentSentence> sentencesAll = new ArrayList<>();
+        sentencesAll.addAll(sentences1);
+        sentencesAll.addAll(sentences2);
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentencesAll, counts));
+
+        assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("zelda"));
+        assertEquals(IntList.of(9), keywordsBuilder.wordToPos.get("link"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(2, linkTextSpans.size());
+
+        DocumentKeywordsBuilder.DocumentWordSpan span;
+        span = linkTextSpans.get(0);
+
+        assertEquals(6, span.start());
+        assertEquals(7, span.end());
+
+        span = linkTextSpans.get(1);
+
+        assertEquals(9, span.start());
+        assertEquals(10, span.end());
+    }
+
+}
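Note on the expected numbers in these tests, as read from mapLinkTextPositions: in testLinksSingleWord2Reps the mapper starts at startPos = 5; the first repetition pre-increments to 6 for "zelda" and closes its span at pos + 1 = 7, then pads pos += 2 to 8, so the second repetition lands on 9 with its span closing at 10. That yields positions {6, 9} and spans (6, 7) and (9, 10), exactly as asserted. The same arithmetic explains testLinksTwoWords2Reps: words at 6, 7 and 10, 11 with spans (6, 8) and (10, 12).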