From 5c858a2b940f0341b2c5d49b0f489ea30caef44b Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Fri, 6 Dec 2024 14:10:15 +0100
Subject: [PATCH 01/15] (experiment) Modify atags exporter to permit
 duplicates from different source domains

This is an attempt to provide higher resolution term frequency data that
will need evaluation when the data is processed.
---
 .../java/nu/marginalia/extractor/AtagExporter.java | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java
index 9c66b882..6b602a61 100644
--- a/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java
+++ b/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java
@@ -155,12 +155,9 @@ public class AtagExporter implements ExporterIf {
         }
 
         // Deduplicate by hash; we've already checked that the strings are ASCII printable so we don't
-        // need to be concerned about using the fast ASCII hash
-        if (hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(urlString))) {
-            return false;
-        }
-
-        return true;
+        // need to be concerned about using the fast ASCII hash. Note we don't consider the destination URL
+        // here, but the source domain instead.
+        return !hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(baseUrl.domain.toString()));
     }
 }
 

From ee2d5496d01b6a212e5414f59dc3f66d58c59107 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Sat, 7 Dec 2024 14:01:50 +0100
Subject: [PATCH 02/15] Revert "(experiment) Modify atags exporter to permit
 duplicates from different source domains"

This reverts commit 5c858a2b940f0341b2c5d49b0f489ea30caef44b.
---
 .../java/nu/marginalia/extractor/AtagExporter.java | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java
index 6b602a61..9c66b882 100644
--- a/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java
+++ b/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java
@@ -155,9 +155,12 @@ public class AtagExporter implements ExporterIf {
         }
 
         // Deduplicate by hash; we've already checked that the strings are ASCII printable so we don't
-        // need to be concerned about using the fast ASCII hash. Note we don't consider the destination URL
-        // here, but the source domain instead.
-        return !hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(baseUrl.domain.toString()));
+        // need to be concerned about using the fast ASCII hash
+        if (hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(urlString))) {
+            return false;
+        }
+
+        return true;
     }
 }
 

From 291ca8daf1e44df0fb3f19080352e2ba3496a546 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Sun, 8 Dec 2024 00:27:11 +0100
Subject: [PATCH 03/15] (converter/index) Improve atag sentence matching by
 taking into consideration how many times a sentence appears in the links

This change breaks the format of the atags.parquet file.
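In rough outline, the new scheme replays each anchor text sentence a number
of times derived from how often it occurs as a link text, attenuated so that
very popular anchor texts don't balloon the position lists. A minimal sketch
of the attenuation (the helper name is illustrative; the real logic lives
inline in DocumentKeywordExtractor below):

    // Sketch: a link text seen `count` times is replayed roughly sqrt(count)
    // times when synthetic positions are generated, capped at 12 repetitions.
    static int repetitions(int count) {
        return (int) Math.min(Math.sqrt(count), 12);
    }

Under this curve a text seen 4 times is replayed twice, while one seen
10,000 times is still capped at 12 repetitions.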
--- .../index/forward/spans/DocumentSpan.java | 20 +++-- .../forward/ForwardIndexSpansReaderTest.java | 16 ++-- .../results/IndexResultScoreCalculator.java | 20 ++--- .../marginalia/atags/AnchorTextKeywords.java | 12 ++- .../java/nu/marginalia/atags/model/Link.java | 2 +- .../marginalia/atags/model/LinkWithText.java | 4 +- .../atags/source/AnchorTagsImpl.java | 6 +- .../keyword/DocumentKeywordExtractor.java | 74 +++++++++++++------ .../java/nu/marginalia/keyword/LinkTexts.java | 13 +++- 9 files changed, 111 insertions(+), 56 deletions(-) diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java index 5ab5d166..bf077683 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java @@ -123,13 +123,13 @@ public class DocumentSpan { /** Returns true if for any position in the list, there exists a range * (position[i], position[i]+len] that is overlapped by a span */ - public boolean containsRangeExact(IntList positions, int len) { + public int containsRangeExact(IntList positions, int len) { if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) { - return false; + return 0; } int sei = 0; - + int cnt = 0; int start = startsEnds.getInt(sei++); int end = startsEnds.getInt(sei++); @@ -138,7 +138,15 @@ public class DocumentSpan { int position = positions.getInt(pi); if (position == start && position + len == end) { - return true; + cnt++; + if (sei + 2 <= startsEnds.size()) { + pi = 0; + start = startsEnds.getInt(sei++); + end = startsEnds.getInt(sei++); + } + else { + break; + } } else if (position < end) { pi++; @@ -147,11 +155,11 @@ public class DocumentSpan { end = startsEnds.getInt(sei++); } else { - return false; + break; } } - return false; + return cnt; } public int countRangeMatches(IntList positions, int len) { diff --git a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java index f0170883..a5085c25 100644 --- a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java +++ b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java @@ -115,16 +115,16 @@ class ForwardIndexSpansReaderTest { ) { var spans1 = reader.readSpans(arena, offset1); - assertFalse(spans1.heading.containsRangeExact(IntList.of(10), 2)); - assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10), 2)); - assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2)); + assertEquals(0, spans1.heading.containsRangeExact(IntList.of(10), 2)); + assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10), 2)); + assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2)); - assertTrue(spans1.heading.containsRangeExact(IntList.of(10), 5)); - assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10), 5)); - assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5)); + assertEquals(1, spans1.heading.containsRangeExact(IntList.of(10), 5)); + assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10), 5)); + assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5)); - assertFalse(spans1.heading.containsRangeExact(IntList.of(11), 5)); - 
assertFalse(spans1.heading.containsRangeExact(IntList.of(9), 5)); + assertEquals(0, spans1.heading.containsRangeExact(IntList.of(11), 5)); + assertEquals(0, spans1.heading.containsRangeExact(IntList.of(9), 5)); } } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java index 74ad0e60..10f4ea05 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java @@ -388,11 +388,13 @@ public class IndexResultScoreCalculator { } var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT); - if (extLinkSpan.length() == fullGroup.size - && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size)) - { - score += 2; // Add additional bonus if there's a single-word atag span + if (extLinkSpan.length() >= fullGroup.size) { + int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size); + if (cnt > 0) { + score += 2 * cnt; + } } + return; } @@ -407,9 +409,9 @@ public class IndexResultScoreCalculator { // Bonus if there's a perfect match with an atag span var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT); - if (extLinkSpan.length() == fullGroup.size && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size)) - { - score += 2; + if (extLinkSpan.length() >= fullGroup.size) { + int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size); + score += 2*cnt; } // For optional groups, we scale the score by the size of the group relative to the full group @@ -420,7 +422,7 @@ public class IndexResultScoreCalculator { IntList intersections = optionalGroup.findIntersections(positions); for (var tag : HtmlTag.includedTags) { - int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);; + int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size); if (cnts > 0) { score += (float) (weights_partial[tag.ordinal()] * optionalGroup.size * sizeScalingFactor * (1 + Math.log(2 + cnts))); } @@ -457,7 +459,7 @@ public class IndexResultScoreCalculator { case NAV -> 0.1f; case CODE -> 0.25f; case BODY -> 1.0f; - case EXTERNAL_LINKTEXT -> 0.75f; + case EXTERNAL_LINKTEXT -> 1.5f; default -> 0.0f; }; } diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java index 2e0b6bd7..2ee65d25 100644 --- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java +++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java @@ -1,6 +1,8 @@ package nu.marginalia.atags; import com.google.inject.Inject; +import gnu.trove.list.TIntList; +import gnu.trove.list.array.TIntArrayList; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.model.Link; import nu.marginalia.keyword.LinkTexts; @@ -51,6 +53,7 @@ public class AnchorTextKeywords { List keywordsRaw = links.forUrl(url); List ret = new ArrayList<>(keywordsRaw.size()); + TIntList counts = new TIntArrayList(keywordsRaw.size()); // Extract and count keywords from anchor text for (Link keyword : keywordsRaw) { @@ -59,18 +62,20 @@ public class AnchorTextKeywords { var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT)); ret.add(sentence); + 
counts.add(keyword.count()); } - return new LinkTexts(ret); + return new LinkTexts(ret, counts); } public LinkTexts getAnchorTextKeywords(DomainLinks links, List urls) { List keywordsRaw = new ArrayList<>(); for (var url : urls) { - links.forUrl(url); + keywordsRaw.addAll(links.forUrl(url)); } List ret = new ArrayList<>(keywordsRaw.size()); + TIntList counts = new TIntArrayList(keywordsRaw.size()); // Extract and count keywords from anchor text for (Link keyword : keywordsRaw) { @@ -79,8 +84,9 @@ public class AnchorTextKeywords { var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT)); ret.add(sentence); + counts.add(keyword.count()); } - return new LinkTexts(ret); + return new LinkTexts(ret, counts); } } diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java index 1c76469f..66d1e977 100644 --- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java +++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java @@ -1,4 +1,4 @@ package nu.marginalia.atags.model; -public record Link(String source, String text) { +public record Link(String text, int count) { } diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java index 784580fc..55986949 100644 --- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java +++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java @@ -1,7 +1,7 @@ package nu.marginalia.atags.model; -public record LinkWithText(String url, String text, String source) { +public record LinkWithText(String url, String text, int cnt) { public Link toLink() { - return new Link(source, text); + return new Link(text, cnt); } } diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java index c80a57c7..a15dfecd 100644 --- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java +++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java @@ -80,7 +80,7 @@ public class AnchorTagsImpl implements AnchorTagsSource { select unnest(text) as 'text', unnest(url) as 'url', - unnest(source) as 'source' + unnest(cnt) as 'cnt' from atags where dest = ? """)) @@ -89,7 +89,7 @@ public class AnchorTagsImpl implements AnchorTagsSource { ps.setString(1, domain.toString()); var rs = ps.executeQuery(); while (rs.next()) { - links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source"))); + links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getInt("cnt"))); } // Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu? 
@@ -102,7 +102,7 @@ public class AnchorTagsImpl implements AnchorTagsSource { String url = rs.getString("url"); url = aliasDomain + url.substring(url.indexOf('/')); - links.add(new LinkWithText(url, rs.getString("text"), rs.getString("source"))); + links.add(new LinkWithText(url, rs.getString("text"), rs.getInt("cnt"))); } return new DomainLinks(links); } diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java index d0db9b7c..7d9eae69 100644 --- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -1,6 +1,7 @@ package nu.marginalia.keyword; import com.google.inject.Inject; +import gnu.trove.list.TIntList; import nu.marginalia.WmsaHome; import nu.marginalia.keyword.extractors.*; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; @@ -17,6 +18,9 @@ import java.util.Comparator; import java.util.List; import java.util.stream.Stream; +import static java.lang.Math.min; +import static java.lang.Math.sqrt; + public class DocumentKeywordExtractor { private final KeywordExtractor keywordExtractor; @@ -162,40 +166,60 @@ public class DocumentKeywordExtractor { recorder.reset(); } + // --- + // Next add synthetic positions to the document for anchor texts pos += 2; // add some padding to the end of the document before we start adding a-tag words - for (var linkText : linkTexts) { - for (var word : linkText) { - pos++; + // Add + + List sentences = linkTexts.linkTexts(); + TIntList counts = linkTexts.counts(); + SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT); + + for (int i = 0; i < linkTexts.length(); i++) { + + DocumentSentence sentence = sentences.get(i); + + // We repeat a link sentence a number of times that is a function of how many times it's been spotted + // as a link text. 
A really "big" link typically has hundreds, if not thousands of repetitions, so we + // attenuate that a bit with math so we don't generate a needlessly large positions list + + final int repetitions = (int) min(sqrt(counts.get(i)), 12); + + for (int ci = 0; ci < repetitions; ci++) { + + for (var word : sentence) { + pos++; + + extLinkRecorder.update(sentence, pos); + + if (word.isStopWord()) { + continue; + } + + String w = word.wordLowerCase(); + if (matchesWordPattern(w)) { + /* Add information about term positions */ + wordsBuilder.addPos(w, pos); + + /* Add metadata for word */ + wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed())); + } - for (var recorder : spanRecorders) { - recorder.update(linkText, pos); } - if (word.isStopWord()) { - continue; - } + // Add a break between sentences, to prevent them being registered as one long run-on sentence + extLinkRecorder.stop(pos + 1); - String w = word.wordLowerCase(); - if (matchesWordPattern(w)) { - /* Add information about term positions */ - wordsBuilder.addPos(w, pos); - - /* Add metadata for word */ - wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed())); - } + // Also add some positional padding between separate link texts so we don't match across their boundaries + pos += 2; } - - // add some padding between separate link texts so we don't match across their boundaries - pos+=2; } - for (var recorder : spanRecorders) { - wordsBuilder.addSpans(recorder.finish(pos)); - } + wordsBuilder.addSpans(extLinkRecorder.finish(pos)); } boolean matchesWordPattern(String s) { @@ -265,6 +289,12 @@ public class DocumentKeywordExtractor { } } + public void stop(int pos) { + if (start > 0) { + spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos)); + start = 0; + } + } public List finish(int length) { if (start > 0) { spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length)); diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java index c1ade6b4..0251c168 100644 --- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java @@ -1,14 +1,23 @@ package nu.marginalia.keyword; +import gnu.trove.list.TIntList; +import gnu.trove.list.array.TIntArrayList; import nu.marginalia.language.model.DocumentSentence; import org.jetbrains.annotations.NotNull; import java.util.Iterator; import java.util.List; -public record LinkTexts(List linkTexts) implements Iterable { +public record LinkTexts( + List linkTexts, + TIntList counts +) implements Iterable { public LinkTexts() { - this(List.of()); + this(List.of(), new TIntArrayList()); + } + + public int length() { + return linkTexts.size(); } @NotNull From 20abb9165770783d8424390a04a24db9a2b777a4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 8 Dec 2024 13:12:52 +0100 Subject: [PATCH 04/15] (loader) Correct DocumentLoaderService to properly do bulk inserts Fixes issue #128 --- .../nu/marginalia/loading/documents/DocumentLoaderService.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java index 537d6869..e7ceb519 100644 --- 
a/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java @@ -75,7 +75,6 @@ public class DocumentLoaderService { public void accept(SlopDocumentRecord.MetadataProjection projection) { - long urlId = UrlIdCodec.encodeId( domainIdRegistry.getDomainId(projection.domain()), projection.ordinal() @@ -88,7 +87,7 @@ public class DocumentLoaderService { } try { - documentDbWriter.add(new DocdbUrlDetail( + details.add(new DocdbUrlDetail( urlId, parsedUrl.get(), projection.title(), From e0c0ed27bc2006d254a0b4dbbdc2cf203d9c53f5 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 8 Dec 2024 14:14:52 +0100 Subject: [PATCH 05/15] (keyword-extraction) Clean up code and add tests for position and spans calculation This code has been a bit of a mess and historically significantly flaky, so some test coverage is more than overdue. --- .../keyword/DocumentKeywordExtractor.java | 214 +--------------- .../keyword/DocumentPositionMapper.java | 237 ++++++++++++++++++ .../marginalia/keyword/KeywordMetadata.java | 2 +- .../java/nu/marginalia/keyword/LinkTexts.java | 22 +- .../model/DocumentKeywordsBuilder.java | 6 +- .../keyword/DocumentKeywordExtractorTest.java | 15 -- .../keyword/DocumentPositionMapperTest.java | 184 ++++++++++++++ 7 files changed, 445 insertions(+), 235 deletions(-) create mode 100644 code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java create mode 100644 code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java index 7d9eae69..c4050d0a 100644 --- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -1,43 +1,33 @@ package nu.marginalia.keyword; import com.google.inject.Inject; -import gnu.trove.list.TIntList; import nu.marginalia.WmsaHome; import nu.marginalia.keyword.extractors.*; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.language.model.DocumentLanguageData; -import nu.marginalia.language.model.DocumentSentence; -import nu.marginalia.language.model.WordRep; -import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.model.EdgeUrl; import nu.marginalia.term_frequency_dict.TermFrequencyDict; -import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; -import java.util.List; import java.util.stream.Stream; -import static java.lang.Math.min; -import static java.lang.Math.sqrt; - public class DocumentKeywordExtractor { - private final KeywordExtractor keywordExtractor; private final TermFrequencyDict dict; + private final KeywordExtractor keywordExtractor = new KeywordExtractor(); + private final DocumentPositionMapper positionMapper = new DocumentPositionMapper(); @Inject public DocumentKeywordExtractor(TermFrequencyDict dict) { this.dict = dict; - this.keywordExtractor = new KeywordExtractor(); } // for tests public DocumentKeywordExtractor() { try { this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels()); - this.keywordExtractor = new 
KeywordExtractor(); } catch (Exception ex) { throw new RuntimeException(ex); @@ -64,7 +54,7 @@ public class DocumentKeywordExtractor { DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder(); - createSimpleWords(wordsBuilder, keywordMetadata, dld, linkTexts); + positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts); createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords); createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords); @@ -110,202 +100,4 @@ public class DocumentKeywordExtractor { } } - private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder, - KeywordMetadata metadata, - DocumentLanguageData dld, - LinkTexts linkTexts) - { - // we use 1-based indexing since the data - // will be gamma encoded, and it can't represent 0 - int pos = 0; - - List spanRecorders = new ArrayList<>(); - for (var htmlTag : HtmlTag.includedTags) { - if (!htmlTag.exclude) { - spanRecorders.add(new SpanRecorder(htmlTag)); - } - } - - for (DocumentSentence sent : dld) { - for (var word : sent) { - pos++; - - for (var recorder : spanRecorders) { - recorder.update(sent, pos); - } - - if (word.isStopWord()) { - continue; - } - - String w = word.wordLowerCase(); - if (matchesWordPattern(w)) { - /* Add information about term positions */ - wordsBuilder.addPos(w, pos); - - /* Add metadata for word */ - wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed())); - } - } - - for (var names : keywordExtractor.getProperNames(sent)) { - var rep = new WordRep(sent, names); - - byte meta = metadata.getMetadataForWord(rep.stemmed); - - wordsBuilder.addMeta(rep.word, meta); - } - } - - pos++; // we need to add one more position to account for the last word in the document - - for (var recorder : spanRecorders) { - wordsBuilder.addSpans(recorder.finish(pos)); - - // reset the recorder, so we can use it again without adding the same positions twice - recorder.reset(); - } - - // --- - - // Next add synthetic positions to the document for anchor texts - - pos += 2; // add some padding to the end of the document before we start adding a-tag words - - - // Add - - List sentences = linkTexts.linkTexts(); - TIntList counts = linkTexts.counts(); - SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT); - - for (int i = 0; i < linkTexts.length(); i++) { - - DocumentSentence sentence = sentences.get(i); - - // We repeat a link sentence a number of times that is a function of how many times it's been spotted - // as a link text. 
A really "big" link typically has hundreds, if not thousands of repetitions, so we - // attenuate that a bit with math so we don't generate a needlessly large positions list - - final int repetitions = (int) min(sqrt(counts.get(i)), 12); - - for (int ci = 0; ci < repetitions; ci++) { - - for (var word : sentence) { - pos++; - - extLinkRecorder.update(sentence, pos); - - if (word.isStopWord()) { - continue; - } - - String w = word.wordLowerCase(); - if (matchesWordPattern(w)) { - /* Add information about term positions */ - wordsBuilder.addPos(w, pos); - - /* Add metadata for word */ - wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed())); - } - - } - - // Add a break between sentences, to prevent them being registered as one long run-on sentence - extLinkRecorder.stop(pos + 1); - - // Also add some positional padding between separate link texts so we don't match across their boundaries - pos += 2; - } - } - - wordsBuilder.addSpans(extLinkRecorder.finish(pos)); - } - - boolean matchesWordPattern(String s) { - // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4} - - String wordPartSeparator = ".-_/:+*"; - - int i = 0; - - for (int run = 0; run < 15 && i < s.length(); run++, i++) { - char c = s.charAt(i); - if (c >= 'a' && c <= 'z') continue; - if (c >= 'A' && c <= 'Z') continue; - if (c >= '0' && c <= '9') continue; - break; - } - - if (i == 0) - return false; - - for (int j = 0; j < 5; j++) { - if (i == s.length()) return true; - - if (wordPartSeparator.indexOf(s.charAt(i)) < 0) { - return false; - } - - i++; - - for (int run = 0; run < 10 && i < s.length(); run++, i++) { - char c = s.charAt(i); - if (c >= 'a' && c <= 'z') continue; - if (c >= 'A' && c <= 'Z') continue; - if (c >= '0' && c <= '9') continue; - break; - } - } - - return false; - } - - /** Helper class to record spans of words */ - private static class SpanRecorder { - private List spans = new ArrayList<>(); - private final HtmlTag htmlTag; - private int start = 0; - - public SpanRecorder(HtmlTag htmlTag) { - this.htmlTag = htmlTag; - } - - public void update(DocumentSentence sentence, int pos) { - assert pos > 0; - - if ( - sentence.htmlTags.contains(htmlTag) - || (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY) // special case for body tag, we match against no tag on the sentence - ) - { - if (start <= 0) start = pos; - } - else { - if (start > 0) { - spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos)); - start = 0; - } - } - } - - public void stop(int pos) { - if (start > 0) { - spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos)); - start = 0; - } - } - public List finish(int length) { - if (start > 0) { - spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length)); - start = 0; - } - return spans; - } - - public void reset() { - spans.clear(); - start = 0; - } - } } diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java new file mode 100644 index 00000000..0644cf76 --- /dev/null +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java @@ -0,0 +1,237 @@ +package nu.marginalia.keyword; + +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; +import nu.marginalia.language.model.DocumentLanguageData; +import 
nu.marginalia.language.model.DocumentSentence; +import nu.marginalia.language.model.WordRep; +import nu.marginalia.language.sentence.tag.HtmlTag; + +import java.util.ArrayList; +import java.util.List; + +import static java.lang.Math.min; +import static java.lang.Math.sqrt; + +/** DocumentPositionMapper is responsible for assigning keywords positions in the document, + * as well as recording spans of positions + */ +public class DocumentPositionMapper { + + private final KeywordExtractor keywordExtractor = new KeywordExtractor(); + + public void mapPositionsAndExtractSimpleKeywords(DocumentKeywordsBuilder wordsBuilder, + KeywordMetadata metadata, + DocumentLanguageData dld, + LinkTexts linkTexts) + { + + // First map the words in the documnent to their positions + int pos = mapDocumentPositions(wordsBuilder, metadata, dld); + + // Next create some padding space to avoid cross-matching + pos += 2; + + // Finally allocate some virtual space after the end of the document + // for the link texts, so that we can match against them as well, although + // these will be given a different span type. + mapLinkTextPositions(pos, wordsBuilder, metadata, linkTexts); + } + + + int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder, + KeywordMetadata metadata, + DocumentLanguageData dld) + + { + + List spanRecorders = new ArrayList<>(); + for (var htmlTag : HtmlTag.includedTags) { + if (!htmlTag.exclude) { + spanRecorders.add(new SpanRecorder(htmlTag)); + } + } + + // we use 1-based indexing since the data + // will be gamma encoded, and it can't represent 0; + // but the loop starts by incrementing the position, + // so while unintuitive, zero is correct here. + int pos = 0; + + for (DocumentSentence sent : dld) { + for (var word : sent) { + pos++; + + // Update span position tracking + for (var recorder : spanRecorders) { + recorder.update(sent, pos); + } + + if (word.isStopWord()) { + continue; + } + + String w = word.wordLowerCase(); + if (matchesWordPattern(w)) { + /* Add information about term positions */ + wordsBuilder.addPos(w, pos); + + /* Add metadata for word */ + wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed())); + } + } + + for (var names : keywordExtractor.getProperNames(sent)) { + WordRep rep = new WordRep(sent, names); + byte meta = metadata.getMetadataForWord(rep.stemmed); + + wordsBuilder.addMeta(rep.word, meta); + } + } + + pos++; // we need to add one more position to account for the last word in the document + + for (var recorder : spanRecorders) { + wordsBuilder.addSpans(recorder.finish(pos)); + } + + return pos; + } + + void mapLinkTextPositions(int startPos, + DocumentKeywordsBuilder wordsBuilder, + KeywordMetadata metadata, + LinkTexts linkTexts) + { + int pos = startPos; + + SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT); + + LinkTexts.Iter iter = linkTexts.iterator(); + + while (iter.next()) { + + DocumentSentence sentence = iter.sentence(); + int count = iter.count(); + + // We repeat a link sentence a number of times that is a function of how many times it's been spotted + // as a link text. 
A really "big" link typically has hundreds, if not thousands of repetitions, so we + // attenuate that a bit with math so we don't generate a needlessly large positions list + + final int repetitions = (int) Math.max(1, min(sqrt(count), 12)); + + for (int ci = 0; ci < repetitions; ci++) { + + for (var word : sentence) { + pos++; + + extLinkRecorder.update(sentence, pos); + + if (word.isStopWord()) { + continue; + } + + String w = word.wordLowerCase(); + if (matchesWordPattern(w)) { + /* Add information about term positions */ + wordsBuilder.addPos(w, pos); + + /* Add metadata for word */ + wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed())); + } + } + + // Add a break between sentences, to prevent them being registered as one long run-on sentence + extLinkRecorder.endCurrentSpan(pos + 1); + + // Also add some positional padding between separate link texts so we don't match across their boundaries + pos += 2; + } + } + + wordsBuilder.addSpans(extLinkRecorder.finish(pos)); + } + + boolean matchesWordPattern(String s) { + // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4} + + String wordPartSeparator = ".-_/:+*"; + + int i = 0; + + for (int run = 0; run < 15 && i < s.length(); run++, i++) { + char c = s.charAt(i); + if (c >= 'a' && c <= 'z') continue; + if (c >= 'A' && c <= 'Z') continue; + if (c >= '0' && c <= '9') continue; + break; + } + + if (i == 0) + return false; + + for (int j = 0; j < 5; j++) { + if (i == s.length()) return true; + + if (wordPartSeparator.indexOf(s.charAt(i)) < 0) { + return false; + } + + i++; + + for (int run = 0; run < 10 && i < s.length(); run++, i++) { + char c = s.charAt(i); + if (c >= 'a' && c <= 'z') continue; + if (c >= 'A' && c <= 'Z') continue; + if (c >= '0' && c <= '9') continue; + break; + } + } + + return false; + } + + /** Helper class to record spans of words */ + private static class SpanRecorder { + private final List spans = new ArrayList<>(); + private final HtmlTag htmlTag; + private int start = 0; + + public SpanRecorder(HtmlTag htmlTag) { + this.htmlTag = htmlTag; + } + + public void update(DocumentSentence sentence, int pos) { + assert pos > 0; + + if (sentence.htmlTags.contains(htmlTag)) { + if (start <= 0) start = pos; + } + else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY) + { + // special case for body tag, we match against no tag on the sentence + if (start <= 0) start = pos; + } + else { + if (start > 0) { + spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos)); + start = 0; + } + } + } + + public void endCurrentSpan(int pos) { + if (start > 0) { + spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos)); + start = 0; + } + } + + public List finish(int length) { + if (start > 0) { + spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length)); + start = 0; + } + return spans; + } + } +} diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java index 021bbbb0..1b1e5571 100644 --- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java @@ -6,7 +6,7 @@ import nu.marginalia.keyword.extractors.TitleKeywords; import nu.marginalia.keyword.extractors.UrlKeywords; import 
nu.marginalia.model.idx.WordFlags; -class KeywordMetadata { +public class KeywordMetadata { private final TitleKeywords titleKeywords; private final NameLikeKeywords nameLikeKeywords; diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java index 0251c168..f2501930 100644 --- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java @@ -5,13 +5,12 @@ import gnu.trove.list.array.TIntArrayList; import nu.marginalia.language.model.DocumentSentence; import org.jetbrains.annotations.NotNull; -import java.util.Iterator; import java.util.List; public record LinkTexts( List linkTexts, TIntList counts -) implements Iterable { +) { public LinkTexts() { this(List.of(), new TIntArrayList()); } @@ -21,8 +20,21 @@ public record LinkTexts( } @NotNull - @Override - public Iterator iterator() { - return linkTexts.iterator(); + public LinkTexts.Iter iterator() { + return new Iter(); + } + + public class Iter { + private int pos = -1; + + public boolean next() { + return ++pos < length(); + } + public int count() { + return counts.get(pos); + } + public DocumentSentence sentence() { + return linkTexts.get(pos); + } } } diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java index 74a424ef..6d2a4df5 100644 --- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java @@ -17,7 +17,7 @@ import java.util.*; public class DocumentKeywordsBuilder { public final Object2ByteOpenHashMap wordToMeta; public final HashMap wordToPos; - public final Map> wordSpans = new HashMap<>(); + public final Map> wordSpans = new HashMap<>(); /** * These ware keywords that had signals of high relevance @@ -70,7 +70,7 @@ public class DocumentKeywordsBuilder { positionsForTag.add(span.end()); } - spans.add(new CodedWordSpan((byte) tag.charValue(), VarintCodedSequence.generate(positionsForTag))); + spans.add(new CodedWordSpan(tag.code, VarintCodedSequence.generate(positionsForTag))); }); return new DocumentKeywords(wordArray, meta.toArray(), positions, spans); @@ -128,7 +128,7 @@ public class DocumentKeywordsBuilder { public void addSpans(List newSpans) { for (var span : newSpans) { - wordSpans.computeIfAbsent((char) span.tag().code, k -> new ArrayList<>()).add(span); + wordSpans.computeIfAbsent(span.tag(), k -> new ArrayList<>()).add(span); } } diff --git a/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java index 83996e41..5f25f8ed 100644 --- a/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java +++ b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java @@ -25,21 +25,6 @@ class DocumentKeywordExtractorTest { static DocumentKeywordExtractor extractor = new 
DocumentKeywordExtractor(); static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); - @Test - public void testWordPattern() { - Assertions.assertTrue(extractor.matchesWordPattern("test")); - Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde")); - Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef")); - - Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test")); - Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test")); - Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24")); - Assertions.assertTrue(extractor.matchesWordPattern("std::vector")); - Assertions.assertTrue(extractor.matchesWordPattern("c++")); - Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h")); - Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse")); - } - @Test public void testKeyboards2() throws IOException, URISyntaxException { var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"), diff --git a/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java new file mode 100644 index 00000000..a00dd3ae --- /dev/null +++ b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java @@ -0,0 +1,184 @@ +package nu.marginalia.keyword; + +import gnu.trove.list.TIntList; +import gnu.trove.list.array.TIntArrayList; +import it.unimi.dsi.fastutil.ints.IntList; +import nu.marginalia.WmsaHome; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; +import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.language.model.DocumentSentence; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.language.sentence.tag.HtmlTag; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class DocumentPositionMapperTest { + private final DocumentPositionMapper positionMapper = new DocumentPositionMapper(); + static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); + + @Test + public void testWordPattern() { + Assertions.assertTrue(positionMapper.matchesWordPattern("test")); + Assertions.assertTrue(positionMapper.matchesWordPattern("1234567890abcde")); + Assertions.assertFalse(positionMapper.matchesWordPattern("1234567890abcdef")); + + Assertions.assertTrue(positionMapper.matchesWordPattern("test-test-test-test-test")); + Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test")); + Assertions.assertTrue(positionMapper.matchesWordPattern("192.168.1.100/24")); + Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector")); + Assertions.assertTrue(positionMapper.matchesWordPattern("c++")); + Assertions.assertTrue(positionMapper.matchesWordPattern("m*a*s*h")); + Assertions.assertFalse(positionMapper.matchesWordPattern("Stulpnagelstrasse")); + } + + @Test + public void testBasic() { + DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder(); + DocumentLanguageData dld = new DocumentLanguageData( + se.extractSentencesFromString("I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)), + 
"I am a teapot" + ); + + int pos = positionMapper.mapDocumentPositions(keywordsBuilder, Mockito.mock(KeywordMetadata.class), dld); + + assertEquals(8, pos); + assertEquals(IntList.of(1), keywordsBuilder.wordToPos.get("i")); + assertEquals(IntList.of(2), keywordsBuilder.wordToPos.get("am")); + assertEquals(IntList.of(3), keywordsBuilder.wordToPos.get("a")); + assertEquals(IntList.of(4), keywordsBuilder.wordToPos.get("teapot")); + assertEquals(IntList.of(5), keywordsBuilder.wordToPos.get("short")); + assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("and")); + assertEquals(IntList.of(7), keywordsBuilder.wordToPos.get("stout")); + + var codeSpans = keywordsBuilder.wordSpans.get(HtmlTag.CODE); + assertEquals(1, codeSpans.size()); + var codeSpan = codeSpans.getFirst(); + + assertEquals(1, codeSpan.start()); + assertEquals(8, codeSpan.end()); + } + + + @Test + public void testLinksSingleWord1Rep() { + DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder(); + + var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT)); + assertEquals(1, sentences.size()); + TIntList counts = new TIntArrayList(new int[] { 1 }); + + positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class), + new LinkTexts(sentences, counts)); + + assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("zelda")); + + var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT); + assertEquals(1, linkTextSpans.size()); + var codeSpan = linkTextSpans.getFirst(); + + assertEquals(6, codeSpan.start()); + assertEquals(7, codeSpan.end()); + } + + @Test + public void testLinksSingleWord2Reps() { + DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder(); + + var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT)); + assertEquals(1, sentences.size()); + TIntList counts = new TIntArrayList(new int[] { 4 }); // This will become 2 repetitions, formula is ~ sqrt(counts) + + positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class), + new LinkTexts(sentences, counts)); + + assertEquals(IntList.of(6, 9), keywordsBuilder.wordToPos.get("zelda")); + + var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT); + assertEquals(2, linkTextSpans.size()); + + DocumentKeywordsBuilder.DocumentWordSpan span; + span = linkTextSpans.get(0); + + assertEquals(6, span.start()); + assertEquals(7, span.end()); + + span = linkTextSpans.get(1); + + assertEquals(9, span.start()); + assertEquals(10, span.end()); + } + + @Test + public void testLinksTwoWords2Reps() { + DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder(); + + var sentences = se.extractSentencesFromString("Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT)); + assertEquals(1, sentences.size()); + TIntList counts = new TIntArrayList(new int[] { 4 }); + + positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class), + new LinkTexts(sentences, counts)); + + assertEquals(IntList.of(6, 10), keywordsBuilder.wordToPos.get("zelda")); + assertEquals(IntList.of(7, 11), keywordsBuilder.wordToPos.get("ii")); + + var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT); + assertEquals(2, linkTextSpans.size()); + + DocumentKeywordsBuilder.DocumentWordSpan span; + span = linkTextSpans.get(0); + + assertEquals(6, span.start()); + assertEquals(8, span.end()); + + span = linkTextSpans.get(1); + + assertEquals(10, span.start()); + 
assertEquals(12, span.end()); + } + + + @Test + public void testLinksTwoSent1Word1Rep() { + DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder(); + + var sentences1 = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT)); + var sentences2 = se.extractSentencesFromString("Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT)); + assertEquals(1, sentences1.size()); + assertEquals(1, sentences2.size()); + TIntList counts = new TIntArrayList(new int[] { 1, 1 }); + + List sentencesAll = new ArrayList<>(); + sentencesAll.addAll(sentences1); + sentencesAll.addAll(sentences2); + + positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class), + new LinkTexts(sentencesAll, counts)); + + assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("zelda")); + assertEquals(IntList.of(9), keywordsBuilder.wordToPos.get("link")); + + var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT); + assertEquals(2, linkTextSpans.size()); + + DocumentKeywordsBuilder.DocumentWordSpan span; + span = linkTextSpans.get(0); + + assertEquals(6, span.start()); + assertEquals(7, span.end()); + + span = linkTextSpans.get(1); + + assertEquals(9, span.start()); + assertEquals(10, span.end()); + } + + +} \ No newline at end of file From 3c2bb566da0220c65dc3929a21dd62a05e1e2f3e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 10 Dec 2024 13:41:05 +0100 Subject: [PATCH 06/15] (converter) Wipe the converter output path on initialization to avoid lingering stale data. --- .../writer/ConverterBatchWriter.java | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java index 785318d9..9dba2444 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java @@ -12,6 +12,7 @@ import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.model.processed.SlopDomainLinkRecord; import nu.marginalia.model.processed.SlopDomainRecord; import nu.marginalia.sequence.VarintCodedSequence; +import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,20 +33,26 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class); public ConverterBatchWriter(Path basePath, int batchNumber) throws IOException { - if (!Files.exists(ProcessedDataFileNames.domainFileName(basePath))) { - Files.createDirectory(ProcessedDataFileNames.domainFileName(basePath)); - } - domainWriter = new SlopDomainRecord.Writer(ProcessedDataFileNames.domainFileName(basePath), batchNumber); + Path domainPath = initSlopDir(ProcessedDataFileNames.domainFileName(basePath)); + Path linksPath = initSlopDir(ProcessedDataFileNames.domainLinkFileName(basePath)); + Path docsPath = initSlopDir(ProcessedDataFileNames.documentFileName(basePath)); - if (!Files.exists(ProcessedDataFileNames.domainLinkFileName(basePath))) { - Files.createDirectory(ProcessedDataFileNames.domainLinkFileName(basePath)); - } - domainLinkWriter = new SlopDomainLinkRecord.Writer(ProcessedDataFileNames.domainLinkFileName(basePath), batchNumber); + domainWriter = new 
SlopDomainRecord.Writer(domainPath, batchNumber); + domainLinkWriter = new SlopDomainLinkRecord.Writer(linksPath, batchNumber); + documentWriter = new SlopDocumentRecord.Writer(docsPath, batchNumber); + } - if (!Files.exists(ProcessedDataFileNames.documentFileName(basePath))) { - Files.createDirectory(ProcessedDataFileNames.documentFileName(basePath)); + private Path initSlopDir(Path p) throws IOException { + if (Files.isDirectory(p)) { + FileUtils.deleteDirectory(p.toFile()); } - documentWriter = new SlopDocumentRecord.Writer(ProcessedDataFileNames.documentFileName(basePath), batchNumber); + else if (Files.exists(p)) { + Files.delete(p); + } + + Files.createDirectories(p); + + return p; } /** Sets the lowest ordinal value for the documents in this batch */ @@ -114,7 +121,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter documentWriter.write(new SlopDocumentRecord( domainName, document.url.toString(), - ordinal, + ordinal++, document.state.toString(), document.stateReason, document.details.title, @@ -132,17 +139,15 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter spanCodes, spanSequences )); - - ordinal++; } } - private Object writeLinkData(ProcessedDomain domain) throws IOException { + private void writeLinkData(ProcessedDomain domain) throws IOException { String from = domain.domain.toString(); if (domain.documents == null) - return this; + return; Set seen = new HashSet<>(); @@ -171,10 +176,9 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter )); } - return this; } - public Object writeDomainData(ProcessedDomain domain) throws IOException { + public void writeDomainData(ProcessedDomain domain) throws IOException { DomainMetadata metadata = DomainMetadata.from(domain); List feeds = getFeedUrls(domain); @@ -191,8 +195,6 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter feeds ) ); - - return this; } private List getFeedUrls(ProcessedDomain domain) { From c5d657ef9800db68ade4819a8e04f0ea3bc1073e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 10 Dec 2024 13:42:10 +0100 Subject: [PATCH 07/15] (live-crawler) Flag live crawled documents with a special keyword --- .../converting/processor/DocumentDecorator.java | 4 ++++ .../converting/processor/DomainProcessor.java | 16 +++++++++++++++- .../marginalia/livecrawler/LiveCrawlerMain.java | 3 ++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java index 2a4fbcb1..2eb073b9 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java @@ -2,6 +2,7 @@ package nu.marginalia.converting.processor; import nu.marginalia.converting.model.ProcessedDocument; +import java.util.Collection; import java.util.HashSet; import java.util.Set; @@ -14,6 +15,9 @@ public class DocumentDecorator { public void addTerm(String term) { extraSearchTerms.add(term); } + public void addTerms(Collection terms) { + extraSearchTerms.addAll(terms); + } public void apply(ProcessedDocument doc) { if (doc == null) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java 
b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java index c0999c96..d31195f8 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -66,6 +66,16 @@ public class DomainProcessor { return fullProcessing(domain); } + public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection extraKeywords) { + try { + return new SideloadProcessing(dataStream, sizeHint, extraKeywords); + } + catch (Exception ex) { + logger.warn("Failed to process domain sideload", ex); + return null; + } + } + public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) { try { return new SideloadProcessing(dataStream, sizeHint); @@ -74,7 +84,6 @@ public class DomainProcessor { logger.warn("Failed to process domain sideload", ex); return null; } - } public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource { @@ -89,6 +98,10 @@ public class DomainProcessor { ); SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException { + this(dataStream, sizeHint, List.of()); + } + + SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection extraKeywords) throws IOException { this.dataStream = dataStream; if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain crawledDomain)) @@ -100,6 +113,7 @@ public class DomainProcessor { domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint; documentDecorator = new DocumentDecorator(); + documentDecorator.addTerms(extraKeywords); processDomain(crawledDomain, domain, documentDecorator); diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java index d05925bb..f8af9267 100644 --- a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java +++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java @@ -41,6 +41,7 @@ import java.time.temporal.ChronoUnit; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; import static nu.marginalia.mqapi.ProcessInboxNames.LIVE_CRAWLER_INBOX; @@ -196,7 +197,7 @@ public class LiveCrawlerMain extends ProcessMainClass { writer.setOrdinalOffset(67_000_000); for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) { - writer.write(domainProcessor.sideloadProcessing(stream, 0)); + writer.write(domainProcessor.sideloadProcessing(stream, 0, Set.of("special:live"))); } } From 9fc82574f0d23ffea806c5f86d38a57302e1ea7c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 10 Dec 2024 13:51:42 +0100 Subject: [PATCH 08/15] (fingerprint) Add FluxGarden as a wiki generator #130 --- .../processor/logic/DocumentGeneratorExtractor.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java index c3c9eac4..cfc333c7 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java +++ 
b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java @@ -57,6 +57,7 @@ public class DocumentGeneratorExtractor { case "one.com": case "wix.com": case "wpbakery": + case "FluxGarden": return DocumentGenerator.of(parts[0]); case "adobe": case "microsoft": @@ -183,7 +184,7 @@ public class DocumentGeneratorExtractor { return DocumentGenerator.of("apache"); } if (header.contains("server: cowboy")) { - return DocumentGenerator.of("cowboy"); // erlang, really?! + return DocumentGenerator.of("cowboy"); // erlang, apparently } } @@ -281,7 +282,7 @@ public class DocumentGeneratorExtractor { -> GeneratorType.FORUM; case "mediawiki", "dokuwiki", "wikidot", "sharepoint" -> GeneratorType.WIKI; - case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc" + case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc", "FluxGarden" -> GeneratorType.DOCS; case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic", "osclass" -> GeneratorType.ECOMMERCE_AND_SPAM; From cf7f84f0330ae70b225d3e80782f680338e9d09f Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 10 Dec 2024 22:04:12 +0100 Subject: [PATCH 09/15] (rank) Reduce the impact of domain rank bonus, and only apply it to cancel out negative penalties, never to increase the ranking --- .../searchquery/model/results/ResultRankingParameters.java | 2 +- .../index/results/IndexResultScoreCalculator.java | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java index cba98152..575af8cf 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java @@ -69,7 +69,7 @@ public class ResultRankingParameters { .bm25Params(new Bm25Parameters(1.2, 0.5)) .shortDocumentThreshold(2000) .shortDocumentPenalty(2.) - .domainRankBonus(1 / 25.) + .domainRankBonus(1 / 100.) .qualityPenalty(1 / 15.) 
.shortSentenceThreshold(2) .shortSentencePenalty(5) diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java index 10f4ea05..20020735 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java @@ -129,9 +129,9 @@ public class IndexResultScoreCalculator { double score = normalize( score_firstPosition + score_proximity + score_verbatim + score_bM25 - + score_bFlags - + Math.max(0, documentBonus), - -Math.min(0, documentBonus)); + + score_bFlags, + -Math.min(0, documentBonus) // The magnitude of documentBonus, if it is negative; otherwise 0 + ); if (Double.isNaN(score)) { // This should never happen but if it does, we want to know about it if (getClass().desiredAssertionStatus()) { From 461bc3eb1ab419afe8a8a16516dc7b1a3b85af39 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 10 Dec 2024 22:22:52 +0100 Subject: [PATCH 10/15] (generator) Add special workaround to flag fextralife as a wiki --- .../processor/logic/DocumentGeneratorExtractor.java | 11 +++++++++-- .../processor/plugin/HtmlDocumentProcessorPlugin.java | 2 +- .../specialization/JavadocSpecializationTest.java | 5 +++-- .../specialization/LemmySpecializationTest.java | 8 +++++--- .../specialization/XenForoSpecializationTest.java | 6 ++++-- 5 files changed, 22 insertions(+), 10 deletions(-) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java index cfc333c7..e6a87089 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java @@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.logic; import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.model.GeneratorType; +import nu.marginalia.model.EdgeUrl; import org.apache.commons.lang3.StringUtils; import org.jsoup.nodes.Document; @@ -13,7 +14,12 @@ import java.util.List; public class DocumentGeneratorExtractor { private static final String defaultValue = "unset"; - public DocumentGenerator detectGenerator(Document doc, DocumentHeaders responseHeaders) { + public DocumentGenerator detectGenerator(EdgeUrl url, Document doc, DocumentHeaders responseHeaders) { + + // Fextralife leaves no known tech fingerprint, but we know it's a wiki software of some sort + if (url.domain.toString().endsWith(".wiki.fextralife.com")) { + return DocumentGenerator.of("wiki"); + } var tags = doc.select("meta[name=generator]"); @@ -69,6 +75,7 @@ public class DocumentGeneratorExtractor { } } + if (parts.length > 1) { return DocumentGenerator.of(parts[0], parts[0] + "_" + truncVersion(parts[1])); } @@ -282,7 +289,7 @@ public class DocumentGeneratorExtractor { -> GeneratorType.FORUM; case "mediawiki", "dokuwiki", "wikidot", "sharepoint" -> GeneratorType.WIKI; - case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc", "FluxGarden" + case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc", "FluxGarden", "wiki" -> GeneratorType.DOCS; case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic", "osclass" -> GeneratorType.ECOMMERCE_AND_SPAM; 
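
The fextralife special case above short-circuits generator detection on the URL alone, before any meta generator tag or response header fingerprint is consulted; the synthetic "wiki" generator it returns is then bucketed as GeneratorType.DOCS by the switch at the bottom of the file. A minimal sketch of the resulting behavior, assuming the no-argument constructor the tests further down appear to use (the URL is illustrative):

    var extractor = new DocumentGeneratorExtractor();
    var doc = Jsoup.parse("<html><head></head><body></body></html>"); // no generator meta tag at all

    // The .wiki.fextralife.com suffix check fires before any fingerprinting,
    // so this yields DocumentGenerator.of("wiki") despite the bare document:
    var gen = extractor.detectGenerator(
            new EdgeUrl("https://eldenring.wiki.fextralife.com/"), // may throw URISyntaxException
            doc,
            new DocumentHeaders(""));
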
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 09b4a360..e27d0f68 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -129,7 +129,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin final EdgeUrl url = new EdgeUrl(crawledDocument.url); final DocumentHeaders documentHeaders = new DocumentHeaders(crawledDocument.headers); - final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, documentHeaders); + final var generatorParts = documentGeneratorExtractor.detectGenerator(url, doc, documentHeaders); final var specialization = htmlProcessorSpecializations.select(generatorParts, url); diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java index 253fc673..1b162790 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.processor.plugin.specialization; import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; import nu.marginalia.converting.processor.summary.SummaryExtractor; +import nu.marginalia.model.EdgeUrl; import nu.marginalia.test.CommonTestData; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeAll; @@ -34,8 +35,8 @@ class JavadocSpecializationTest { } @Test - void generatorExtraction() { - var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders("")); + void generatorExtraction() throws Exception { + var gen = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(thread), new DocumentHeaders("")); System.out.println(gen); } diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java index 178796df..77d3fc05 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java @@ -3,11 +3,13 @@ package nu.marginalia.converting.processor.plugin.specialization; import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; import nu.marginalia.converting.processor.summary.SummaryExtractor; +import nu.marginalia.model.EdgeUrl; import nu.marginalia.test.CommonTestData; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import java.net.URISyntaxException; import java.util.Set; class LemmySpecializationTest { @@ -37,9 +39,9 @@ 
class LemmySpecializationTest { } @Test - void generatorExtraction() { - var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), new DocumentHeaders("")); - var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), new DocumentHeaders("")); + void generatorExtraction() throws URISyntaxException { + var generatorIndex = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(lemmyIndexHtml), new DocumentHeaders("")); + var generatorPost = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(lemmyPost), new DocumentHeaders("")); System.out.println(generatorIndex); System.out.println(generatorPost); diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java index 3efd2900..c4005c06 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java @@ -3,11 +3,13 @@ package nu.marginalia.converting.processor.plugin.specialization; import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; import nu.marginalia.converting.processor.summary.SummaryExtractor; +import nu.marginalia.model.EdgeUrl; import nu.marginalia.test.CommonTestData; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import java.net.URISyntaxException; import java.util.Set; class XenForoSpecializationTest { @@ -34,8 +36,8 @@ class XenForoSpecializationTest { } @Test - void generatorExtraction() { - var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders("")); + void generatorExtraction() throws URISyntaxException { + var gen = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(thread), new DocumentHeaders("")); System.out.println(gen); } From 73861e613fc491a90efc4fdfb12c99e856aa5a6d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 11 Dec 2024 15:44:29 +0100 Subject: [PATCH 11/15] (ranking) Downtune score boost for unordered heading matches --- .../nu/marginalia/index/results/IndexResultScoreCalculator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java index 20020735..788f8705 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java @@ -454,7 +454,7 @@ public class IndexResultScoreCalculator { for (int i = 0; i < weights.length; i++) { weights[i] = switch(HtmlTag.includedTags[i]) { case TITLE -> 2.5f; - case HEADING -> 2.5f; + case HEADING -> 1.25f; case ANCHOR -> 0.2f; case NAV -> 0.1f; case CODE -> 0.25f; From 5002870d1f35572aedd8f0c23487a7003882dc03 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 11 Dec 2024 16:01:38 +0100 Subject: [PATCH 12/15] (converter) Refactor sideloaders to improve feature handling and keyword logic Centralized HTML feature handling with `applyFeatures` in StackexchangeSideloader and added dynamic
synthetic term generation. Improved HTML structure in RedditSideloader and enhanced metadata processing with feature-based keywords. Updated DomainLinks to correctly compute link counts using individual link occurrences. --- .../marginalia/atags/model/DomainLinks.java | 8 +++- .../sideload/reddit/RedditSideloader.java | 41 ++++++++++++------- .../StackexchangeSideloader.java | 26 +++++++----- 3 files changed, 49 insertions(+), 26 deletions(-) diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java index 14e6ad99..0d6d8a8d 100644 --- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java +++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java @@ -41,7 +41,13 @@ public class DomainLinks { /** Returns the number of links to the given url. */ public int countForUrl(EdgeUrl url) { String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param); - return links.getOrDefault(key, List.of()).size(); + + int cnt = 0; + for (var link : links.getOrDefault(key, List.of())) { + cnt += link.count(); + } + + return cnt; } @Override diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java index 61ccf09f..61fc9e32 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java @@ -2,7 +2,6 @@ package nu.marginalia.converting.sideload.reddit; import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.model.DomainLinks; -import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; @@ -13,7 +12,7 @@ import nu.marginalia.integration.reddit.db.RedditDb; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.util.ProcessingIterator; import org.apache.commons.lang3.StringUtils; @@ -30,16 +29,13 @@ public class RedditSideloader implements SideloadSource { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class); private final List<Path> dbFiles; - private final AnchorTagsSourceFactory anchorTagsSourceFactory; private final AnchorTextKeywords anchorTextKeywords; private final SideloaderProcessing sideloaderProcessing; public RedditSideloader(List<Path> listToDbFiles, - AnchorTagsSourceFactory anchorTagsSourceFactory, AnchorTextKeywords anchorTextKeywords, SideloaderProcessing sideloaderProcessing) { this.dbFiles = listToDbFiles; - this.anchorTagsSourceFactory = anchorTagsSourceFactory; this.anchorTextKeywords = anchorTextKeywords; this.sideloaderProcessing = sideloaderProcessing; } @@ -116,14 +112,25 @@ public class RedditSideloader implements SideloadSource { .ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC) .getYear();
-        String fullHtml = "<!DOCTYPE html>\n<html>\n<head>\n  <title>" + title + "</title>\n</head>\n<body>\n" +
-                "<h1>" + title + "</h1>\n" +
-                "\n" +
-                "<p>" + body + "</p>\n" +
-                "\n</body>\n</html>";
+        String fullHtml = """
+                <!DOCTYPE html>
+                <html>
+                <head>
+                    <title>%s</title>
+                </head>
+                <body>
+                    <h1>%s</h1>
+                    <article>
+                        <p>reddit r/%s %s</p>
+                    </article>
+                    <article>
+                        <p>%s</p>
+                    </article>
+                </body>
+                </html>
+                """.formatted(title, title, subreddit, subreddit, body);
List<String> extraKeywords = new ArrayList<>(); - extraKeywords.add("reddit"); - extraKeywords.add(subreddit); - extraKeywords.add("r/" + subreddit); - if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) { extraKeywords.add(author); } @@ -147,12 +154,18 @@ if (doc.isProcessedFully()) { - for (var keyword : extraKeywords) { - doc.words.addMeta(keyword, WordFlags.Subjects.asBit()); + // Insert topology information + if (doc.details != null) { + doc.details.metadata.withSizeAndTopology(50_000_000, score); } - // Insert topology information - doc.details.metadata.withSizeAndTopology(50_000_000, score); + if (doc.words != null) { + doc.words.addAllSyntheticTerms(List.of("generator:forum", + HtmlFeature.COOKIES.getKeyword(), + HtmlFeature.JS.getKeyword(), + HtmlFeature.TRACKING_ADTECH.getKeyword() + )); + } }
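
To see what the DomainLinks change at the top of this patch does, note that countForUrl previously returned the number of distinct link records for a URL, whereas it now sums how often each link was observed. A self-contained sketch of the new arithmetic; the local record below is a hypothetical stand-in for the Link model, whose full shape isn't shown in this hunk:

    // Hypothetical stand-in for nu.marginalia.atags.model.Link, which exposes count()
    record CountedLink(String sourceDomain, String linkText, int count) {}

    List<CountedLink> records = List.of(
            new CountedLink("a.example.com", "marginalia search", 3),
            new CountedLink("b.example.com", "marginalia", 5));

    int cnt = 0;
    for (var link : records) {
        cnt += link.count(); // new behavior: 3 + 5 == 8
    }
    // old behavior: records.size() == 2
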
+ + + """.formatted(title, title, subreddit, subreddit, body); List extraKeywords = new ArrayList<>(); - extraKeywords.add("reddit"); - extraKeywords.add(subreddit); - extraKeywords.add("r/" + subreddit); - if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) { extraKeywords.add(author); } @@ -147,12 +154,18 @@ public class RedditSideloader implements SideloadSource { if (doc.isProcessedFully()) { - for (var keyword : extraKeywords) { - doc.words.addMeta(keyword, WordFlags.Subjects.asBit()); + // Insert topology information + if (doc.details != null) { + doc.details.metadata.withSizeAndTopology(50_000_000, score); } - // Insert topology information - doc.details.metadata.withSizeAndTopology(50_000_000, score); + if (doc.words != null) { + doc.words.addAllSyntheticTerms(List.of("generator:forum", + HtmlFeature.COOKIES.getKeyword(), + HtmlFeature.JS.getKeyword(), + HtmlFeature.TRACKING_ADTECH.getKeyword() + )); + } } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java index bf4d21f1..c42443b3 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java @@ -24,10 +24,7 @@ import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import java.nio.file.Path; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.Iterator; -import java.util.List; +import java.util.*; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.TimeUnit; @@ -36,6 +33,8 @@ public class StackexchangeSideloader implements SideloadSource { private final DocumentKeywordExtractor keywordExtractor; private final String domainName; + private final EnumSet applyFeatures = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING); + private final Path dbFile; public StackexchangeSideloader(Path pathToDbFile, @@ -133,12 +132,17 @@ public class StackexchangeSideloader implements SideloadSource { ret.url = url; ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url); - ret.words.addAllSyntheticTerms(List.of( - "site:" + domainName, - "site:" + url.domain.topDomain, - url.domain.topDomain, - domainName - )); + + List syntheticTerms = new ArrayList<>( + List.of("site:" + domainName, + "site:" + url.domain.topDomain, + url.domain.topDomain, + domainName) + ); + for (HtmlFeature feature : applyFeatures) { + syntheticTerms.add(feature.getKeyword()); + } + ret.words.addAllSyntheticTerms(syntheticTerms); if (!post.tags().isBlank()) { List subjects = Arrays.asList(post.tags().split(",")); @@ -152,7 +156,7 @@ public class StackexchangeSideloader implements SideloadSource { PubDate.toYearByte(ret.details.pubYear), (int) -ret.details.quality, EnumSet.of(DocumentFlags.GeneratorDocs)); - ret.details.features = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING); + ret.details.features = applyFeatures; ret.details.metadata.withSizeAndTopology(10000, 0); From a97c05107ebcf78457571ee82d69d6e5f2ed2e96 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 11 Dec 2024 16:10:44 +0100 Subject: [PATCH 13/15] Add synthetic meta flag for root path documents If the document's URL path is "/", a "special:root" meta flag is now added with the "Synthetic" bit set. 
This will help searching only for the root document of a website, neat stuff ahead :D --- .../marginalia/converting/processor/DocumentProcessor.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java index 36eae72a..d1e4d495 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -15,6 +15,7 @@ import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.model.crawldata.CrawledDocument; import nu.marginalia.model.crawldata.CrawlerDocumentStatus; +import nu.marginalia.model.idx.WordFlags; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -118,6 +119,10 @@ public class DocumentProcessor { ret.details = detailsWithWords.details(); ret.words = detailsWithWords.words(); + if (url.path.equals("/")) { + ret.words.addMeta("special:root", WordFlags.Synthetic.asBit()); + } + documentDecorator.apply(ret); if (Boolean.TRUE.equals(crawledDocument.hasCookies) From 3b99cffb3dcfa459aa1cb8cdb2b2a4ed5a05b0f3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 11 Dec 2024 16:42:47 +0100 Subject: [PATCH 14/15] (link-parser) Filter out URLs with binary file suffixes in LinkParser Added an additional filter step to ensure URLs with binary suffixes are excluded during crawling. This prevents unnecessary processing of non-HTML content, improving the efficiency of the link parsing process. --- .../java/nu/marginalia/link_parser/LinkParser.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/LinkParser.java b/code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/LinkParser.java index 8a04863d..717ae8a5 100644 --- a/code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/LinkParser.java +++ b/code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/LinkParser.java @@ -42,7 +42,8 @@ public class LinkParser { .flatMap(this::createURI) .map(URI::normalize) .map(this::renormalize) - .flatMap(this::createEdgeUrl); + .flatMap(this::createEdgeUrl) + .filter(url -> !hasBinarySuffix(url.path)); } @Contract(pure=true) From e65d75a0f9835295de15ef227041ac728133cf10 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 11 Dec 2024 17:01:52 +0100 Subject: [PATCH 15/15] (crawler) Reintroduce content type probing and clean out bad content type data from the existing crawl sets --- .../sideload/SideloadSourceFactory.java | 3 +- .../crawl/retreival/CrawlDataReference.java | 4 +++ .../crawl/retreival/CrawlerRetreiver.java | 32 +++++++++---------- .../java/nu/marginalia/ContentTypes.java | 22 +++++++++++++ ...rawledDocumentParquetRecordFileWriter.java | 16 +++++++++- .../retreival/fetcher/WarcRecorderTest.java | 4 +-- 6 files changed, 59 insertions(+), 22 deletions(-) create mode 100644 code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index 8c6e92d2..f3c6227d 100644 --- 
a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -65,8 +65,7 @@ public class SideloadSourceFactory { public SideloadSource sideloadReddit(Path pathToDbFiles) throws IOException { return sideload(pathToDbFiles, new PathSuffixPredicate(".db"), - (List<Path> paths) -> new RedditSideloader(paths, - anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing)); + (List<Path> paths) -> new RedditSideloader(paths, anchorTextKeywords, sideloaderProcessing)); } public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException { diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java index b0b2c014..98133bcf 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -1,5 +1,6 @@ package nu.marginalia.crawl.retreival; +import nu.marginalia.ContentTypes; import nu.marginalia.io.SerializableCrawlDataStream; import nu.marginalia.lsh.EasyLSH; import nu.marginalia.model.crawldata.CrawledDocument; @@ -43,6 +44,9 @@ public class CrawlDataReference implements AutoCloseable { try { while (data.hasNext()) { if (data.next() instanceof CrawledDocument doc) { + if (!ContentTypes.isAccepted(doc.contentType)) + continue; + return doc; } } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index c6b426b3..ace2059b 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -317,26 +317,24 @@ public class CrawlerRetreiver implements AutoCloseable { long probeStart = System.currentTimeMillis(); - /* - probing is on probation for now while we evaluate how much the added delays slows down the crawler - if (probeType == HttpFetcher.ProbeType.FULL) { + retryLoop: for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { try { var probeResult = fetcher.probeContentType(url, warcRecorder, contentTags); - if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.Ok ok) { - url = ok.resolvedUrl(); // If we were redirected while probing, use the final URL for fetching - break; - } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.BadContentType badContentType) { - return new HttpFetchResult.ResultNone(); - } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout) { - return new HttpFetchResult.ResultException(timeout.ex()); - } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.Exception exception) { - return new HttpFetchResult.ResultException(exception.ex()); - } - else { // should be unreachable - throw new IllegalStateException("Unknown probe result"); + switch (probeResult) { + case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl): + url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching + break retryLoop; + case HttpFetcher.ContentTypeProbeResult.BadContentType badContentType: + return new HttpFetchResult.ResultNone(); + case HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout: + return new HttpFetchResult.ResultException(timeout.ex()); + case HttpFetcher.ContentTypeProbeResult.Exception exception: + return new HttpFetchResult.ResultException(exception.ex()); + default: // should be unreachable + throw new IllegalStateException("Unknown probe result"); } } catch (HttpFetcherImpl.RateLimitException ex) { @@ -348,8 +346,8 @@ } } - timer.waitFetchDelay(System.currentTimeMillis() - probeStart); - }*/ + timer.waitFetchDelay(System.currentTimeMillis() - probeStart); + } for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { diff --git a/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java new file mode 100644 index 00000000..dbc1989c --- /dev/null +++ b/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java @@ -0,0 +1,22 @@ +package nu.marginalia; + +import java.util.Set; + +public class ContentTypes { + public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml", + "application/xhtml", + "text/html", + "image/x-icon", + "text/plain"); + + public static boolean isAccepted(String contentTypeHeader) { + String lcHeader = contentTypeHeader.toLowerCase(); + for (var type : acceptedContentTypes) { + if (lcHeader.startsWith(type)) { + return true; + } + } + return false; + } + +} diff --git a/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java index f231c703..9474c2ff 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java @@ -1,6 +1,7 @@ package nu.marginalia.parquet.crawldata; import blue.strategic.parquet.ParquetWriter; +import nu.marginalia.ContentTypes; import nu.marginalia.UserAgent; import nu.marginalia.model.body.DocumentBodyExtractor; import nu.marginalia.model.body.DocumentBodyResult; @@ -62,6 +63,8 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { } } + + /** Return true if the WarcResponse should be excluded from conversion */ private static boolean filterResponse(String uaString, WarcResponse response) throws IOException { @@ -74,14 +77,25 @@ return false; } - var robotsTags = response.http().headers().all("X-Robots-Tag"); + var headers = response.http().headers(); + var robotsTags = headers.all("X-Robots-Tag"); + if (!isXRobotsTagsPermitted(robotsTags, uaString)) { return false; } + // Strip out responses with content types we aren't interested in + // (though ideally we wouldn't download these at all) + String contentType = headers.first("Content-Type").orElse("text/plain").toLowerCase(); + + if (!ContentTypes.isAccepted(contentType)) { + return false; + } + return true; } + private void write(String domain, WarcXEntityRefused refused) throws IOException { URI profile = refused.profile();
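
Because Content-Type headers usually carry parameters such as a charset, isAccepted deliberately matches lowercase prefixes rather than whole header values. A few illustrative inputs (the header strings here are examples, not taken from the patch):

    ContentTypes.isAccepted("text/html; charset=UTF-8"); // true: starts with "text/html"
    ContentTypes.isAccepted("Application/XHTML+XML");    // true: the header is lowercased first
    ContentTypes.isAccepted("image/x-icon");             // true: favicons stay in the accept list
    ContentTypes.isAccepted("image/png");                // false: filtered out downstream
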
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java index d6d407bf..b2a0f2bc 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -157,10 +157,10 @@ class WarcRecorderTest { fileNameParquet); var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList(); - assertEquals(3, urls.size()); + assertEquals(2, urls.size()); assertEquals("https://www.marginalia.nu/", urls.get(0)); assertEquals("https://www.marginalia.nu/log/", urls.get(1)); - assertEquals("https://www.marginalia.nu/sanic.png", urls.get(2)); + // sanic.png gets filtered out for its bad mime type }
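
The adjusted assertions follow directly from the new filterResponse logic: a response whose Content-Type fails the ContentTypes check is now dropped before it is written to the parquet file. Condensed from the writer hunk above, with headers standing for the same response header object filterResponse already consults:

    // Missing Content-Type defaults to text/plain, which passes the check.
    String contentType = headers.first("Content-Type").orElse("text/plain").toLowerCase();

    if (!ContentTypes.isAccepted(contentType)) {
        return false; // e.g. the image/png response for sanic.png never reaches the parquet output
    }
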