Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-24 05:18:58 +00:00

(keyword-extraction) Clean up code and add tests for position and spans calculation

This code has been a bit of a mess and historically quite flaky, so some test coverage is more than overdue.

Parent: 20abb91657
Commit: e0c0ed27bc
DocumentKeywordExtractor.java:

@@ -1,43 +1,33 @@
 package nu.marginalia.keyword;

 import com.google.inject.Inject;
-import gnu.trove.list.TIntList;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.keyword.extractors.*;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.model.DocumentLanguageData;
-import nu.marginalia.language.model.DocumentSentence;
-import nu.marginalia.language.model.WordRep;
-import nu.marginalia.language.sentence.tag.HtmlTag;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;

-import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Comparator;
-import java.util.List;
 import java.util.stream.Stream;

-import static java.lang.Math.min;
-import static java.lang.Math.sqrt;

 public class DocumentKeywordExtractor {

-    private final KeywordExtractor keywordExtractor;
     private final TermFrequencyDict dict;

+    private final KeywordExtractor keywordExtractor = new KeywordExtractor();
+    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();

     @Inject
     public DocumentKeywordExtractor(TermFrequencyDict dict) {
         this.dict = dict;
-        this.keywordExtractor = new KeywordExtractor();
     }

     // for tests
     public DocumentKeywordExtractor() {
         try {
             this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels());
-            this.keywordExtractor = new KeywordExtractor();
         }
         catch (Exception ex) {
             throw new RuntimeException(ex);

@@ -64,7 +54,7 @@ public class DocumentKeywordExtractor {

         DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();

-        createSimpleWords(wordsBuilder, keywordMetadata, dld, linkTexts);
+        positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);

         createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
         createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);

@@ -110,202 +100,4 @@ public class DocumentKeywordExtractor {
         }
     }

-    private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
-                                   KeywordMetadata metadata,
-                                   DocumentLanguageData dld,
-                                   LinkTexts linkTexts)
-    {
-        // we use 1-based indexing since the data
-        // will be gamma encoded, and it can't represent 0
-        int pos = 0;
-
-        List<SpanRecorder> spanRecorders = new ArrayList<>();
-        for (var htmlTag : HtmlTag.includedTags) {
-            if (!htmlTag.exclude) {
-                spanRecorders.add(new SpanRecorder(htmlTag));
-            }
-        }
-
-        for (DocumentSentence sent : dld) {
-            for (var word : sent) {
-                pos++;
-
-                for (var recorder : spanRecorders) {
-                    recorder.update(sent, pos);
-                }
-
-                if (word.isStopWord()) {
-                    continue;
-                }
-
-                String w = word.wordLowerCase();
-                if (matchesWordPattern(w)) {
-                    /* Add information about term positions */
-                    wordsBuilder.addPos(w, pos);
-
-                    /* Add metadata for word */
-                    wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
-                }
-            }
-
-            for (var names : keywordExtractor.getProperNames(sent)) {
-                var rep = new WordRep(sent, names);
-
-                byte meta = metadata.getMetadataForWord(rep.stemmed);
-
-                wordsBuilder.addMeta(rep.word, meta);
-            }
-        }
-
-        pos++; // we need to add one more position to account for the last word in the document
-
-        for (var recorder : spanRecorders) {
-            wordsBuilder.addSpans(recorder.finish(pos));
-
-            // reset the recorder, so we can use it again without adding the same positions twice
-            recorder.reset();
-        }
-
-        // ---
-
-        // Next add synthetic positions to the document for anchor texts
-
-        pos += 2; // add some padding to the end of the document before we start adding a-tag words
-
-        // Add
-
-        List<DocumentSentence> sentences = linkTexts.linkTexts();
-        TIntList counts = linkTexts.counts();
-        SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT);
-
-        for (int i = 0; i < linkTexts.length(); i++) {
-
-            DocumentSentence sentence = sentences.get(i);
-
-            // We repeat a link sentence a number of times that is a function of how many times it's been spotted
-            // as a link text.  A really "big" link typically has hundreds, if not thousands of repetitions, so we
-            // attenuate that a bit with math so we don't generate a needlessly large positions list
-            final int repetitions = (int) min(sqrt(counts.get(i)), 12);
-
-            for (int ci = 0; ci < repetitions; ci++) {
-                for (var word : sentence) {
-                    pos++;
-
-                    extLinkRecorder.update(sentence, pos);
-
-                    if (word.isStopWord()) {
-                        continue;
-                    }
-
-                    String w = word.wordLowerCase();
-                    if (matchesWordPattern(w)) {
-                        /* Add information about term positions */
-                        wordsBuilder.addPos(w, pos);
-
-                        /* Add metadata for word */
-                        wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
-                    }
-                }
-
-                // Add a break between sentences, to prevent them being registered as one long run-on sentence
-                extLinkRecorder.stop(pos + 1);
-
-                // Also add some positional padding between separate link texts so we don't match across their boundaries
-                pos += 2;
-            }
-        }
-
-        wordsBuilder.addSpans(extLinkRecorder.finish(pos));
-    }
-
-    boolean matchesWordPattern(String s) {
-        // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
-
-        String wordPartSeparator = ".-_/:+*";
-
-        int i = 0;
-
-        for (int run = 0; run < 15 && i < s.length(); run++, i++) {
-            char c = s.charAt(i);
-            if (c >= 'a' && c <= 'z') continue;
-            if (c >= 'A' && c <= 'Z') continue;
-            if (c >= '0' && c <= '9') continue;
-            break;
-        }
-
-        if (i == 0)
-            return false;
-
-        for (int j = 0; j < 5; j++) {
-            if (i == s.length()) return true;
-
-            if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
-                return false;
-            }
-
-            i++;
-
-            for (int run = 0; run < 10 && i < s.length(); run++, i++) {
-                char c = s.charAt(i);
-                if (c >= 'a' && c <= 'z') continue;
-                if (c >= 'A' && c <= 'Z') continue;
-                if (c >= '0' && c <= '9') continue;
-                break;
-            }
-        }
-
-        return false;
-    }
-
-    /** Helper class to record spans of words */
-    private static class SpanRecorder {
-        private List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
-        private final HtmlTag htmlTag;
-        private int start = 0;
-
-        public SpanRecorder(HtmlTag htmlTag) {
-            this.htmlTag = htmlTag;
-        }
-
-        public void update(DocumentSentence sentence, int pos) {
-            assert pos > 0;
-
-            if (
-                    sentence.htmlTags.contains(htmlTag)
-                 || (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY) // special case for body tag, we match against no tag on the sentence
-               )
-            {
-                if (start <= 0) start = pos;
-            }
-            else {
-                if (start > 0) {
-                    spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
-                    start = 0;
-                }
-            }
-        }
-
-        public void stop(int pos) {
-            if (start > 0) {
-                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
-                start = 0;
-            }
-        }
-
-        public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
-            if (start > 0) {
-                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
-                start = 0;
-            }
-            return spans;
-        }
-
-        public void reset() {
-            spans.clear();
-            start = 0;
-        }
-    }
 }
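Note: the "1-based indexing" comment above exists because Elias gamma coding has codewords only for integers >= 1; a word position of zero would be unrepresentable. A minimal illustration of why (a standalone sketch, not code from this commit; the GammaDemo class is hypothetical):

class GammaDemo {
    // Elias gamma: (bit-length - 1) zeros, then n in binary; defined only for n >= 1.
    static String eliasGamma(int n) {
        if (n < 1) throw new IllegalArgumentException("no gamma codeword for " + n);
        String bits = Integer.toBinaryString(n);       // 5 -> "101"
        return "0".repeat(bits.length() - 1) + bits;   // 5 -> "00101"
    }

    public static void main(String[] args) {
        System.out.println(eliasGamma(1)); // "1"
        System.out.println(eliasGamma(5)); // "00101"
    }
}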
DocumentPositionMapper.java (new file):

@@ -0,0 +1,237 @@
+package nu.marginalia.keyword;
+
+import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.DocumentSentence;
+import nu.marginalia.language.model.WordRep;
+import nu.marginalia.language.sentence.tag.HtmlTag;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static java.lang.Math.min;
+import static java.lang.Math.sqrt;
+
+/** DocumentPositionMapper is responsible for assigning keywords positions in the document,
+ *  as well as recording spans of positions
+ */
+public class DocumentPositionMapper {
+
+    private final KeywordExtractor keywordExtractor = new KeywordExtractor();
+
+    public void mapPositionsAndExtractSimpleKeywords(DocumentKeywordsBuilder wordsBuilder,
+                                                     KeywordMetadata metadata,
+                                                     DocumentLanguageData dld,
+                                                     LinkTexts linkTexts)
+    {
+        // First map the words in the document to their positions
+        int pos = mapDocumentPositions(wordsBuilder, metadata, dld);
+
+        // Next create some padding space to avoid cross-matching
+        pos += 2;
+
+        // Finally allocate some virtual space after the end of the document
+        // for the link texts, so that we can match against them as well, although
+        // these will be given a different span type.
+        mapLinkTextPositions(pos, wordsBuilder, metadata, linkTexts);
+    }
+
+    int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
+                             KeywordMetadata metadata,
+                             DocumentLanguageData dld)
+    {
+        List<SpanRecorder> spanRecorders = new ArrayList<>();
+        for (var htmlTag : HtmlTag.includedTags) {
+            if (!htmlTag.exclude) {
+                spanRecorders.add(new SpanRecorder(htmlTag));
+            }
+        }
+
+        // we use 1-based indexing since the data
+        // will be gamma encoded, and it can't represent 0;
+        // but the loop starts by incrementing the position,
+        // so while unintuitive, zero is correct here.
+        int pos = 0;
+
+        for (DocumentSentence sent : dld) {
+            for (var word : sent) {
+                pos++;
+
+                // Update span position tracking
+                for (var recorder : spanRecorders) {
+                    recorder.update(sent, pos);
+                }
+
+                if (word.isStopWord()) {
+                    continue;
+                }
+
+                String w = word.wordLowerCase();
+                if (matchesWordPattern(w)) {
+                    /* Add information about term positions */
+                    wordsBuilder.addPos(w, pos);
+
+                    /* Add metadata for word */
+                    wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
+                }
+            }
+
+            for (var names : keywordExtractor.getProperNames(sent)) {
+                WordRep rep = new WordRep(sent, names);
+                byte meta = metadata.getMetadataForWord(rep.stemmed);
+
+                wordsBuilder.addMeta(rep.word, meta);
+            }
+        }
+
+        pos++; // we need to add one more position to account for the last word in the document
+
+        for (var recorder : spanRecorders) {
+            wordsBuilder.addSpans(recorder.finish(pos));
+        }
+
+        return pos;
+    }
+
+    void mapLinkTextPositions(int startPos,
+                              DocumentKeywordsBuilder wordsBuilder,
+                              KeywordMetadata metadata,
+                              LinkTexts linkTexts)
+    {
+        int pos = startPos;
+
+        SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT);
+
+        LinkTexts.Iter iter = linkTexts.iterator();
+
+        while (iter.next()) {
+
+            DocumentSentence sentence = iter.sentence();
+            int count = iter.count();
+
+            // We repeat a link sentence a number of times that is a function of how many times it's been spotted
+            // as a link text.  A really "big" link typically has hundreds, if not thousands of repetitions, so we
+            // attenuate that a bit with math so we don't generate a needlessly large positions list
+            final int repetitions = (int) Math.max(1, min(sqrt(count), 12));
+
+            for (int ci = 0; ci < repetitions; ci++) {
+
+                for (var word : sentence) {
+                    pos++;
+
+                    extLinkRecorder.update(sentence, pos);
+
+                    if (word.isStopWord()) {
+                        continue;
+                    }
+
+                    String w = word.wordLowerCase();
+                    if (matchesWordPattern(w)) {
+                        /* Add information about term positions */
+                        wordsBuilder.addPos(w, pos);
+
+                        /* Add metadata for word */
+                        wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
+                    }
+                }
+
+                // Add a break between sentences, to prevent them being registered as one long run-on sentence
+                extLinkRecorder.endCurrentSpan(pos + 1);
+
+                // Also add some positional padding between separate link texts so we don't match across their boundaries
+                pos += 2;
+            }
+        }
+
+        wordsBuilder.addSpans(extLinkRecorder.finish(pos));
+    }
+
+    boolean matchesWordPattern(String s) {
+        // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
+
+        String wordPartSeparator = ".-_/:+*";
+
+        int i = 0;
+
+        for (int run = 0; run < 15 && i < s.length(); run++, i++) {
+            char c = s.charAt(i);
+            if (c >= 'a' && c <= 'z') continue;
+            if (c >= 'A' && c <= 'Z') continue;
+            if (c >= '0' && c <= '9') continue;
+            break;
+        }
+
+        if (i == 0)
+            return false;
+
+        for (int j = 0; j < 5; j++) {
+            if (i == s.length()) return true;
+
+            if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
+                return false;
+            }
+
+            i++;
+
+            for (int run = 0; run < 10 && i < s.length(); run++, i++) {
+                char c = s.charAt(i);
+                if (c >= 'a' && c <= 'z') continue;
+                if (c >= 'A' && c <= 'Z') continue;
+                if (c >= '0' && c <= '9') continue;
+                break;
+            }
+        }
+
+        return false;
+    }
+
+    /** Helper class to record spans of words */
+    private static class SpanRecorder {
+        private final List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
+        private final HtmlTag htmlTag;
+        private int start = 0;
+
+        public SpanRecorder(HtmlTag htmlTag) {
+            this.htmlTag = htmlTag;
+        }
+
+        public void update(DocumentSentence sentence, int pos) {
+            assert pos > 0;
+
+            if (sentence.htmlTags.contains(htmlTag)) {
+                if (start <= 0) start = pos;
+            }
+            else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY)
+            {
+                // special case for body tag, we match against no tag on the sentence
+                if (start <= 0) start = pos;
+            }
+            else {
+                if (start > 0) {
+                    spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
+                    start = 0;
+                }
+            }
+        }
+
+        public void endCurrentSpan(int pos) {
+            if (start > 0) {
+                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
+                start = 0;
+            }
+        }
+
+        public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
+            if (start > 0) {
+                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
+                start = 0;
+            }
+            return spans;
+        }
+    }
+}
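Note: matchesWordPattern documents itself as an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}. A regex-based oracle for cross-checking it in tests could look like the sketch below (hypothetical, not part of this commit). Be aware that the unrolled code is slightly more permissive than the quoted pattern: it accepts inputs like "c++" and "std::vector" (as the tests assert), where the regex as written demands at least one alphanumeric character after every separator, so the two would not agree on every input.

import java.util.regex.Pattern;

// Hypothetical reference oracle built from the regex quoted in the code comment.
class WordPatternReference {
    private static final Pattern WORD =
            Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:+*][\\da-zA-Z]{1,10}){0,4}");

    static boolean matches(String s) {
        return WORD.matcher(s).matches();
    }
}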
KeywordMetadata.java:

@@ -6,7 +6,7 @@ import nu.marginalia.keyword.extractors.TitleKeywords;
 import nu.marginalia.keyword.extractors.UrlKeywords;
 import nu.marginalia.model.idx.WordFlags;

-class KeywordMetadata {
+public class KeywordMetadata {

     private final TitleKeywords titleKeywords;
     private final NameLikeKeywords nameLikeKeywords;
LinkTexts.java:

@@ -5,13 +5,12 @@ import gnu.trove.list.array.TIntArrayList;
 import nu.marginalia.language.model.DocumentSentence;
 import org.jetbrains.annotations.NotNull;

-import java.util.Iterator;
 import java.util.List;

 public record LinkTexts(
         List<DocumentSentence> linkTexts,
         TIntList counts
-) implements Iterable<DocumentSentence> {
+) {
     public LinkTexts() {
         this(List.of(), new TIntArrayList());
     }

@@ -21,8 +20,21 @@ public record LinkTexts(
     }

     @NotNull
-    @Override
-    public Iterator<DocumentSentence> iterator() {
-        return linkTexts.iterator();
+    public LinkTexts.Iter iterator() {
+        return new Iter();
+    }
+
+    public class Iter {
+        private int pos = -1;
+
+        public boolean next() {
+            return ++pos < length();
+        }
+        public int count() {
+            return counts.get(pos);
+        }
+        public DocumentSentence sentence() {
+            return linkTexts.get(pos);
+        }
     }
 }
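Note: LinkTexts no longer implements Iterable<DocumentSentence>. A standard Iterator yields a single value per step, so pairing each sentence with its repetition count would have forced a wrapper allocation per element; the cursor-style Iter exposes both through one next()/sentence()/count() protocol instead. Since pos starts at -1, next() must be called before the first access. A usage sketch (texts is a hypothetical, already-populated LinkTexts instance):

LinkTexts.Iter iter = texts.iterator();
while (iter.next()) {                         // advances the cursor; false when exhausted
    DocumentSentence sentence = iter.sentence();
    int count = iter.count();                 // times this link text was observed
    // process sentence weighted by count ...
}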
DocumentKeywordsBuilder.java:

@@ -17,7 +17,7 @@ import java.util.*;
 public class DocumentKeywordsBuilder {
     public final Object2ByteOpenHashMap<String> wordToMeta;
     public final HashMap<String, IntList> wordToPos;
-    public final Map<Character, List<DocumentWordSpan>> wordSpans = new HashMap<>();
+    public final Map<HtmlTag, List<DocumentWordSpan>> wordSpans = new HashMap<>();

     /**
      * These ware keywords that had signals of high relevance

@@ -70,7 +70,7 @@ public class DocumentKeywordsBuilder {
             positionsForTag.add(span.end());
         }

-        spans.add(new CodedWordSpan((byte) tag.charValue(), VarintCodedSequence.generate(positionsForTag)));
+        spans.add(new CodedWordSpan(tag.code, VarintCodedSequence.generate(positionsForTag)));
     });

     return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);

@@ -128,7 +128,7 @@ public class DocumentKeywordsBuilder {

     public void addSpans(List<DocumentWordSpan> newSpans) {
         for (var span : newSpans) {
-            wordSpans.computeIfAbsent((char) span.tag().code, k -> new ArrayList<>()).add(span);
+            wordSpans.computeIfAbsent(span.tag(), k -> new ArrayList<>()).add(span);
         }
     }
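Note: wordSpans is now keyed by HtmlTag directly, dropping the lossy char round-trip of the tag's byte code; the code is applied only at serialization time (tag.code in the CodedWordSpan above). The surrounding build step flattens each tag's spans into a single position list before varint coding, roughly like the sketch below (the list type and the start-before-end ordering are assumptions here; only the span.end() call and the CodedWordSpan call appear in the diff context):

import java.util.ArrayList;
import java.util.List;

// Hypothetical illustration of per-tag span flattening prior to
// VarintCodedSequence.generate(positionsForTag).
class SpanFlatteningSketch {
    record Span(int start, int end) {}

    static List<Integer> flatten(List<Span> spansForTag) {
        List<Integer> positionsForTag = new ArrayList<>();
        for (Span span : spansForTag) {
            positionsForTag.add(span.start()); // assumed to precede the end offset
            positionsForTag.add(span.end());   // shown in the diff context
        }
        return positionsForTag;
    }
}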
DocumentKeywordExtractorTest.java:

@@ -25,21 +25,6 @@ class DocumentKeywordExtractorTest {
     static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor();
     static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

-    @Test
-    public void testWordPattern() {
-        Assertions.assertTrue(extractor.matchesWordPattern("test"));
-        Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
-        Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));
-
-        Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test"));
-        Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test"));
-        Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24"));
-        Assertions.assertTrue(extractor.matchesWordPattern("std::vector"));
-        Assertions.assertTrue(extractor.matchesWordPattern("c++"));
-        Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h"));
-        Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
-    }
-
     @Test
     public void testKeyboards2() throws IOException, URISyntaxException {
         var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
DocumentPositionMapperTest.java (new file):

@@ -0,0 +1,184 @@
+package nu.marginalia.keyword;
+
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
+import it.unimi.dsi.fastutil.ints.IntList;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.DocumentSentence;
+import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.language.sentence.tag.HtmlTag;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class DocumentPositionMapperTest {
+    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
+    static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
+
+    @Test
+    public void testWordPattern() {
+        Assertions.assertTrue(positionMapper.matchesWordPattern("test"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("1234567890abcde"));
+        Assertions.assertFalse(positionMapper.matchesWordPattern("1234567890abcdef"));
+
+        Assertions.assertTrue(positionMapper.matchesWordPattern("test-test-test-test-test"));
+        Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("192.168.1.100/24"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("c++"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("m*a*s*h"));
+        Assertions.assertFalse(positionMapper.matchesWordPattern("Stulpnagelstrasse"));
+    }
+
+    @Test
+    public void testBasic() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+        DocumentLanguageData dld = new DocumentLanguageData(
+                se.extractSentencesFromString("I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
+                "I am a teapot"
+        );
+
+        int pos = positionMapper.mapDocumentPositions(keywordsBuilder, Mockito.mock(KeywordMetadata.class), dld);
+
+        assertEquals(8, pos);
+        assertEquals(IntList.of(1), keywordsBuilder.wordToPos.get("i"));
+        assertEquals(IntList.of(2), keywordsBuilder.wordToPos.get("am"));
+        assertEquals(IntList.of(3), keywordsBuilder.wordToPos.get("a"));
+        assertEquals(IntList.of(4), keywordsBuilder.wordToPos.get("teapot"));
+        assertEquals(IntList.of(5), keywordsBuilder.wordToPos.get("short"));
+        assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("and"));
+        assertEquals(IntList.of(7), keywordsBuilder.wordToPos.get("stout"));
+
+        var codeSpans = keywordsBuilder.wordSpans.get(HtmlTag.CODE);
+        assertEquals(1, codeSpans.size());
+        var codeSpan = codeSpans.getFirst();
+
+        assertEquals(1, codeSpan.start());
+        assertEquals(8, codeSpan.end());
+    }
+
+
+    @Test
+    public void testLinksSingleWord1Rep() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences.size());
+        TIntList counts = new TIntArrayList(new int[] { 1 });
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentences, counts));
+
+        assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("zelda"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(1, linkTextSpans.size());
+        var codeSpan = linkTextSpans.getFirst();
+
+        assertEquals(6, codeSpan.start());
+        assertEquals(7, codeSpan.end());
+    }
+
+    @Test
+    public void testLinksSingleWord2Reps() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences.size());
+        TIntList counts = new TIntArrayList(new int[] { 4 }); // This will become 2 repetitions, formula is ~ sqrt(counts)
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentences, counts));
+
+        assertEquals(IntList.of(6, 9), keywordsBuilder.wordToPos.get("zelda"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(2, linkTextSpans.size());
+
+        DocumentKeywordsBuilder.DocumentWordSpan span;
+        span = linkTextSpans.get(0);
+
+        assertEquals(6, span.start());
+        assertEquals(7, span.end());
+
+        span = linkTextSpans.get(1);
+
+        assertEquals(9, span.start());
+        assertEquals(10, span.end());
+    }
+
+    @Test
+    public void testLinksTwoWords2Reps() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences = se.extractSentencesFromString("Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences.size());
+        TIntList counts = new TIntArrayList(new int[] { 4 });
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentences, counts));
+
+        assertEquals(IntList.of(6, 10), keywordsBuilder.wordToPos.get("zelda"));
+        assertEquals(IntList.of(7, 11), keywordsBuilder.wordToPos.get("ii"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(2, linkTextSpans.size());
+
+        DocumentKeywordsBuilder.DocumentWordSpan span;
+        span = linkTextSpans.get(0);
+
+        assertEquals(6, span.start());
+        assertEquals(8, span.end());
+
+        span = linkTextSpans.get(1);
+
+        assertEquals(10, span.start());
+        assertEquals(12, span.end());
+    }
+
+
+    @Test
+    public void testLinksTwoSent1Word1Rep() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences1 = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        var sentences2 = se.extractSentencesFromString("Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences1.size());
+        assertEquals(1, sentences2.size());
+        TIntList counts = new TIntArrayList(new int[] { 1, 1 });
+
+        List<DocumentSentence> sentencesAll = new ArrayList<>();
+        sentencesAll.addAll(sentences1);
+        sentencesAll.addAll(sentences2);
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentencesAll, counts));
+
+        assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("zelda"));
+        assertEquals(IntList.of(9), keywordsBuilder.wordToPos.get("link"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(2, linkTextSpans.size());
+
+        DocumentKeywordsBuilder.DocumentWordSpan span;
+        span = linkTextSpans.get(0);
+
+        assertEquals(6, span.start());
+        assertEquals(7, span.end());
+
+        span = linkTextSpans.get(1);
+
+        assertEquals(9, span.start());
+        assertEquals(10, span.end());
+    }
+
+}
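Note on the expected numbers in these tests, as read from mapLinkTextPositions: in testLinksSingleWord2Reps the mapper starts at startPos = 5; the first repetition pre-increments to 6 for "zelda" and closes its span at pos + 1 = 7, then pads pos += 2 to 8, so the second repetition lands on 9 with its span closing at 10. That yields positions {6, 9} and spans (6, 7) and (9, 10), exactly as asserted. The same arithmetic explains testLinksTwoWords2Reps: words at 6, 7 and 10, 11 with spans (6, 8) and (10, 12).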