diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java
index cba98152..575af8cf 100644
--- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java
+++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java
@@ -69,7 +69,7 @@ public class ResultRankingParameters {
.bm25Params(new Bm25Parameters(1.2, 0.5))
.shortDocumentThreshold(2000)
.shortDocumentPenalty(2.)
- .domainRankBonus(1 / 25.)
+ .domainRankBonus(1 / 100.)
.qualityPenalty(1 / 15.)
.shortSentenceThreshold(2)
.shortSentencePenalty(5)
diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java
index 5ab5d166..bf077683 100644
--- a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java
+++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java
@@ -123,13 +123,13 @@ public class DocumentSpan {
- /** Returns true if for any position in the list, there exists a range
- * (position[i], position[i]+len] that is overlapped by a span */
- public boolean containsRangeExact(IntList positions, int len) {
+ /** Returns the number of positions in the list for which there exists a span
+ * that exactly covers the range (position[i], position[i]+len] */
+ public int containsRangeExact(IntList positions, int len) {
if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) {
- return false;
+ return 0;
}
int sei = 0;
-
+ int cnt = 0;
int start = startsEnds.getInt(sei++);
int end = startsEnds.getInt(sei++);
@@ -138,7 +138,15 @@ public class DocumentSpan {
int position = positions.getInt(pi);
if (position == start && position + len == end) {
- return true;
+ cnt++;
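+ // Exact match found; advance to the next span and restart the position
+ // scan from the beginning, so that each span is counted at most once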
+ if (sei + 2 <= startsEnds.size()) {
+ pi = 0;
+ start = startsEnds.getInt(sei++);
+ end = startsEnds.getInt(sei++);
+ }
+ else {
+ break;
+ }
}
else if (position < end) {
pi++;
@@ -147,11 +155,11 @@ public class DocumentSpan {
end = startsEnds.getInt(sei++);
}
else {
- return false;
+ break;
}
}
- return false;
+ return cnt;
}
public int countRangeMatches(IntList positions, int len) {
diff --git a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java
index f0170883..a5085c25 100644
--- a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java
+++ b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java
@@ -115,16 +115,16 @@ class ForwardIndexSpansReaderTest {
) {
var spans1 = reader.readSpans(arena, offset1);
- assertFalse(spans1.heading.containsRangeExact(IntList.of(10), 2));
- assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10), 2));
- assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2));
+ assertEquals(0, spans1.heading.containsRangeExact(IntList.of(10), 2));
+ assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10), 2));
+ assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2));
- assertTrue(spans1.heading.containsRangeExact(IntList.of(10), 5));
- assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10), 5));
- assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5));
+ assertEquals(1, spans1.heading.containsRangeExact(IntList.of(10), 5));
+ assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10), 5));
+ assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5));
- assertFalse(spans1.heading.containsRangeExact(IntList.of(11), 5));
- assertFalse(spans1.heading.containsRangeExact(IntList.of(9), 5));
+ assertEquals(0, spans1.heading.containsRangeExact(IntList.of(11), 5));
+ assertEquals(0, spans1.heading.containsRangeExact(IntList.of(9), 5));
}
}
diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
index 74ad0e60..788f8705 100644
--- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
+++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
@@ -129,9 +129,9 @@ public class IndexResultScoreCalculator {
double score = normalize(
score_firstPosition + score_proximity + score_verbatim
+ score_bM25
- + score_bFlags
- + Math.max(0, documentBonus),
- -Math.min(0, documentBonus));
+ + score_bFlags,
+ -Math.min(0, documentBonus) // The magnitude of documentBonus, if it is negative; otherwise 0
+ );
if (Double.isNaN(score)) { // This should never happen but if it does, we want to know about it
if (getClass().desiredAssertionStatus()) {
@@ -388,11 +388,13 @@ public class IndexResultScoreCalculator {
}
var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
- if (extLinkSpan.length() == fullGroup.size
- && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size))
- {
- score += 2; // Add additional bonus if there's a single-word atag span
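+ // Add a bonus for each atag span that exactly matches the full group of terms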
+ if (extLinkSpan.length() >= fullGroup.size) {
+ int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
+ if (cnt > 0) {
+ score += 2 * cnt;
+ }
}
+
return;
}
@@ -407,9 +409,9 @@ public class IndexResultScoreCalculator {
// Bonus if there's a perfect match with an atag span
var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
- if (extLinkSpan.length() == fullGroup.size && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size))
- {
- score += 2;
+ if (extLinkSpan.length() >= fullGroup.size) {
+ int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
+ score += 2 * cnt;
}
// For optional groups, we scale the score by the size of the group relative to the full group
@@ -420,7 +422,7 @@ public class IndexResultScoreCalculator {
IntList intersections = optionalGroup.findIntersections(positions);
for (var tag : HtmlTag.includedTags) {
- int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);;
+ int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);
if (cnts > 0) {
score += (float) (weights_partial[tag.ordinal()] * optionalGroup.size * sizeScalingFactor * (1 + Math.log(2 + cnts)));
}
@@ -452,12 +454,12 @@ public class IndexResultScoreCalculator {
for (int i = 0; i < weights.length; i++) {
weights[i] = switch(HtmlTag.includedTags[i]) {
case TITLE -> 2.5f;
- case HEADING -> 2.5f;
+ case HEADING -> 1.25f;
case ANCHOR -> 0.2f;
case NAV -> 0.1f;
case CODE -> 0.25f;
case BODY -> 1.0f;
- case EXTERNAL_LINKTEXT -> 0.75f;
+ case EXTERNAL_LINKTEXT -> 1.5f;
default -> 0.0f;
};
}
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java
index 2e0b6bd7..2ee65d25 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java
@@ -1,6 +1,8 @@
package nu.marginalia.atags;
import com.google.inject.Inject;
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.model.Link;
import nu.marginalia.keyword.LinkTexts;
@@ -51,6 +53,7 @@ public class AnchorTextKeywords {
List<Link> keywordsRaw = links.forUrl(url);
List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
+ TIntList counts = new TIntArrayList(keywordsRaw.size());
// Extract and count keywords from anchor text
for (Link keyword : keywordsRaw) {
@@ -59,18 +62,20 @@ public class AnchorTextKeywords {
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
ret.add(sentence);
+ counts.add(keyword.count());
}
- return new LinkTexts(ret);
+ return new LinkTexts(ret, counts);
}
public LinkTexts getAnchorTextKeywords(DomainLinks links, List<EdgeUrl> urls) {
List<Link> keywordsRaw = new ArrayList<>();
for (var url : urls) {
- links.forUrl(url);
+ keywordsRaw.addAll(links.forUrl(url));
}
List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
+ TIntList counts = new TIntArrayList(keywordsRaw.size());
// Extract and count keywords from anchor text
for (Link keyword : keywordsRaw) {
@@ -79,8 +84,9 @@ public class AnchorTextKeywords {
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
ret.add(sentence);
+ counts.add(keyword.count());
}
- return new LinkTexts(ret);
+ return new LinkTexts(ret, counts);
}
}
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java
index 14e6ad99..0d6d8a8d 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java
@@ -41,7 +41,13 @@ public class DomainLinks {
/** Returns the number of links to the given url. */
public int countForUrl(EdgeUrl url) {
String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param);
- return links.getOrDefault(key, List.of()).size();
+
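+ // Sum the per-link counts rather than counting distinct link texts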
+ int cnt = 0;
+ for (var link : links.getOrDefault(key, List.of())) {
+ cnt += link.count();
+ }
+
+ return cnt;
}
@Override
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java
index 1c76469f..66d1e977 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java
@@ -1,4 +1,4 @@
package nu.marginalia.atags.model;
-public record Link(String source, String text) {
+public record Link(String text, int count) {
}
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java
index 784580fc..55986949 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java
@@ -1,7 +1,7 @@
package nu.marginalia.atags.model;
-public record LinkWithText(String url, String text, String source) {
+public record LinkWithText(String url, String text, int cnt) {
public Link toLink() {
- return new Link(source, text);
+ return new Link(text, cnt);
}
}
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
index c80a57c7..a15dfecd 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
@@ -80,7 +80,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
select
unnest(text) as 'text',
unnest(url) as 'url',
- unnest(source) as 'source'
+ unnest(cnt) as 'cnt'
from atags
where dest = ?
"""))
@@ -89,7 +89,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
ps.setString(1, domain.toString());
var rs = ps.executeQuery();
while (rs.next()) {
- links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source")));
+ links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getInt("cnt")));
}
// Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu?
@@ -102,7 +102,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
String url = rs.getString("url");
url = aliasDomain + url.substring(url.indexOf('/'));
- links.add(new LinkWithText(url, rs.getString("text"), rs.getString("source")));
+ links.add(new LinkWithText(url, rs.getString("text"), rs.getInt("cnt")));
}
return new DomainLinks(links);
}
diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
index d0db9b7c..c4050d0a 100644
--- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
+++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
@@ -5,35 +5,29 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData;
-import nu.marginalia.language.model.DocumentSentence;
-import nu.marginalia.language.model.WordRep;
-import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
-import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
-import java.util.List;
import java.util.stream.Stream;
public class DocumentKeywordExtractor {
- private final KeywordExtractor keywordExtractor;
private final TermFrequencyDict dict;
+ private final KeywordExtractor keywordExtractor = new KeywordExtractor();
+ private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
@Inject
public DocumentKeywordExtractor(TermFrequencyDict dict) {
this.dict = dict;
- this.keywordExtractor = new KeywordExtractor();
}
// for tests
public DocumentKeywordExtractor() {
try {
this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels());
- this.keywordExtractor = new KeywordExtractor();
}
catch (Exception ex) {
throw new RuntimeException(ex);
@@ -60,7 +54,7 @@ public class DocumentKeywordExtractor {
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
- createSimpleWords(wordsBuilder, keywordMetadata, dld, linkTexts);
+ positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
@@ -106,176 +100,4 @@ public class DocumentKeywordExtractor {
}
}
- private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
- KeywordMetadata metadata,
- DocumentLanguageData dld,
- LinkTexts linkTexts)
- {
- // we use 1-based indexing since the data
- // will be gamma encoded, and it can't represent 0
- int pos = 0;
-
- List<SpanRecorder> spanRecorders = new ArrayList<>();
- for (var htmlTag : HtmlTag.includedTags) {
- if (!htmlTag.exclude) {
- spanRecorders.add(new SpanRecorder(htmlTag));
- }
- }
-
- for (DocumentSentence sent : dld) {
- for (var word : sent) {
- pos++;
-
- for (var recorder : spanRecorders) {
- recorder.update(sent, pos);
- }
-
- if (word.isStopWord()) {
- continue;
- }
-
- String w = word.wordLowerCase();
- if (matchesWordPattern(w)) {
- /* Add information about term positions */
- wordsBuilder.addPos(w, pos);
-
- /* Add metadata for word */
- wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
- }
- }
-
- for (var names : keywordExtractor.getProperNames(sent)) {
- var rep = new WordRep(sent, names);
-
- byte meta = metadata.getMetadataForWord(rep.stemmed);
-
- wordsBuilder.addMeta(rep.word, meta);
- }
- }
-
- pos++; // we need to add one more position to account for the last word in the document
-
- for (var recorder : spanRecorders) {
- wordsBuilder.addSpans(recorder.finish(pos));
-
- // reset the recorder, so we can use it again without adding the same positions twice
- recorder.reset();
- }
-
- // Next add synthetic positions to the document for anchor texts
-
- pos += 2; // add some padding to the end of the document before we start adding a-tag words
-
- for (var linkText : linkTexts) {
-
- for (var word : linkText) {
- pos++;
-
- for (var recorder : spanRecorders) {
- recorder.update(linkText, pos);
- }
-
- if (word.isStopWord()) {
- continue;
- }
-
- String w = word.wordLowerCase();
- if (matchesWordPattern(w)) {
- /* Add information about term positions */
- wordsBuilder.addPos(w, pos);
-
- /* Add metadata for word */
- wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
- }
- }
-
- // add some padding between separate link texts so we don't match across their boundaries
- pos+=2;
- }
-
- for (var recorder : spanRecorders) {
- wordsBuilder.addSpans(recorder.finish(pos));
- }
- }
-
- boolean matchesWordPattern(String s) {
- // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
-
- String wordPartSeparator = ".-_/:+*";
-
- int i = 0;
-
- for (int run = 0; run < 15 && i < s.length(); run++, i++) {
- char c = s.charAt(i);
- if (c >= 'a' && c <= 'z') continue;
- if (c >= 'A' && c <= 'Z') continue;
- if (c >= '0' && c <= '9') continue;
- break;
- }
-
- if (i == 0)
- return false;
-
- for (int j = 0; j < 5; j++) {
- if (i == s.length()) return true;
-
- if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
- return false;
- }
-
- i++;
-
- for (int run = 0; run < 10 && i < s.length(); run++, i++) {
- char c = s.charAt(i);
- if (c >= 'a' && c <= 'z') continue;
- if (c >= 'A' && c <= 'Z') continue;
- if (c >= '0' && c <= '9') continue;
- break;
- }
- }
-
- return false;
- }
-
- /** Helper class to record spans of words */
- private static class SpanRecorder {
- private List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
- private final HtmlTag htmlTag;
- private int start = 0;
-
- public SpanRecorder(HtmlTag htmlTag) {
- this.htmlTag = htmlTag;
- }
-
- public void update(DocumentSentence sentence, int pos) {
- assert pos > 0;
-
- if (
- sentence.htmlTags.contains(htmlTag)
- || (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY) // special case for body tag, we match against no tag on the sentence
- )
- {
- if (start <= 0) start = pos;
- }
- else {
- if (start > 0) {
- spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
- start = 0;
- }
- }
- }
-
- public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
- if (start > 0) {
- spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
- start = 0;
- }
- return spans;
- }
-
- public void reset() {
- spans.clear();
- start = 0;
- }
- }
}
diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java
new file mode 100644
index 00000000..0644cf76
--- /dev/null
+++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java
@@ -0,0 +1,237 @@
+package nu.marginalia.keyword;
+
+import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.DocumentSentence;
+import nu.marginalia.language.model.WordRep;
+import nu.marginalia.language.sentence.tag.HtmlTag;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static java.lang.Math.min;
+import static java.lang.Math.sqrt;
+
+/** DocumentPositionMapper is responsible for assigning positions to keywords in the document,
+ * as well as recording spans of positions
+ */
+public class DocumentPositionMapper {
+
+ private final KeywordExtractor keywordExtractor = new KeywordExtractor();
+
+ public void mapPositionsAndExtractSimpleKeywords(DocumentKeywordsBuilder wordsBuilder,
+ KeywordMetadata metadata,
+ DocumentLanguageData dld,
+ LinkTexts linkTexts)
+ {
+
+ // First map the words in the document to their positions
+ int pos = mapDocumentPositions(wordsBuilder, metadata, dld);
+
+ // Next create some padding space to avoid cross-matching
+ pos += 2;
+
+ // Finally allocate some virtual space after the end of the document
+ // for the link texts, so that we can match against them as well, although
+ // these will be given a different span type.
+ mapLinkTextPositions(pos, wordsBuilder, metadata, linkTexts);
+ }
+
+
+ int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
+ KeywordMetadata metadata,
+ DocumentLanguageData dld)
+ {
+ List<SpanRecorder> spanRecorders = new ArrayList<>();
+ for (var htmlTag : HtmlTag.includedTags) {
+ if (!htmlTag.exclude) {
+ spanRecorders.add(new SpanRecorder(htmlTag));
+ }
+ }
+
+ // we use 1-based indexing since the data
+ // will be gamma encoded, and it can't represent 0;
+ // but the loop starts by incrementing the position,
+ // so while unintuitive, zero is correct here.
+ int pos = 0;
+
+ for (DocumentSentence sent : dld) {
+ for (var word : sent) {
+ pos++;
+
+ // Update span position tracking
+ for (var recorder : spanRecorders) {
+ recorder.update(sent, pos);
+ }
+
+ if (word.isStopWord()) {
+ continue;
+ }
+
+ String w = word.wordLowerCase();
+ if (matchesWordPattern(w)) {
+ /* Add information about term positions */
+ wordsBuilder.addPos(w, pos);
+
+ /* Add metadata for word */
+ wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
+ }
+ }
+
+ for (var names : keywordExtractor.getProperNames(sent)) {
+ WordRep rep = new WordRep(sent, names);
+ byte meta = metadata.getMetadataForWord(rep.stemmed);
+
+ wordsBuilder.addMeta(rep.word, meta);
+ }
+ }
+
+ pos++; // we need to add one more position to account for the last word in the document
+
+ for (var recorder : spanRecorders) {
+ wordsBuilder.addSpans(recorder.finish(pos));
+ }
+
+ return pos;
+ }
+
+ void mapLinkTextPositions(int startPos,
+ DocumentKeywordsBuilder wordsBuilder,
+ KeywordMetadata metadata,
+ LinkTexts linkTexts)
+ {
+ int pos = startPos;
+
+ SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT);
+
+ LinkTexts.Iter iter = linkTexts.iterator();
+
+ while (iter.next()) {
+
+ DocumentSentence sentence = iter.sentence();
+ int count = iter.count();
+
+ // We repeat a link sentence a number of times that is a function of how many times it's been spotted
+ // as a link text. A really "big" link typically has hundreds, if not thousands of repetitions, so we
+ // attenuate that a bit with math so we don't generate a needlessly large positions list
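+ // (e.g. count=4 gives 2 repetitions, and counts of 144 or more cap out at 12)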
+
+ final int repetitions = (int) Math.max(1, min(sqrt(count), 12));
+
+ for (int ci = 0; ci < repetitions; ci++) {
+
+ for (var word : sentence) {
+ pos++;
+
+ extLinkRecorder.update(sentence, pos);
+
+ if (word.isStopWord()) {
+ continue;
+ }
+
+ String w = word.wordLowerCase();
+ if (matchesWordPattern(w)) {
+ /* Add information about term positions */
+ wordsBuilder.addPos(w, pos);
+
+ /* Add metadata for word */
+ wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
+ }
+ }
+
+ // Add a break between sentences, to prevent them being registered as one long run-on sentence
+ extLinkRecorder.endCurrentSpan(pos + 1);
+
+ // Also add some positional padding between separate link texts so we don't match across their boundaries
+ pos += 2;
+ }
+ }
+
+ wordsBuilder.addSpans(extLinkRecorder.finish(pos));
+ }
+
+ boolean matchesWordPattern(String s) {
+ // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
+
+ String wordPartSeparator = ".-_/:+*";
+
+ int i = 0;
+
+ for (int run = 0; run < 15 && i < s.length(); run++, i++) {
+ char c = s.charAt(i);
+ if (c >= 'a' && c <= 'z') continue;
+ if (c >= 'A' && c <= 'Z') continue;
+ if (c >= '0' && c <= '9') continue;
+ break;
+ }
+
+ if (i == 0)
+ return false;
+
+ for (int j = 0; j < 5; j++) {
+ if (i == s.length()) return true;
+
+ if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
+ return false;
+ }
+
+ i++;
+
+ for (int run = 0; run < 10 && i < s.length(); run++, i++) {
+ char c = s.charAt(i);
+ if (c >= 'a' && c <= 'z') continue;
+ if (c >= 'A' && c <= 'Z') continue;
+ if (c >= '0' && c <= '9') continue;
+ break;
+ }
+ }
+
+ return false;
+ }
+
+ /** Helper class to record spans of words */
+ private static class SpanRecorder {
+ private final List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
+ private final HtmlTag htmlTag;
+ private int start = 0;
+
+ public SpanRecorder(HtmlTag htmlTag) {
+ this.htmlTag = htmlTag;
+ }
+
+ public void update(DocumentSentence sentence, int pos) {
+ assert pos > 0;
+
+ if (sentence.htmlTags.contains(htmlTag)) {
+ if (start <= 0) start = pos;
+ }
+ else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY)
+ {
+ // special case for body tag, we match against no tag on the sentence
+ if (start <= 0) start = pos;
+ }
+ else {
+ if (start > 0) {
+ spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
+ start = 0;
+ }
+ }
+ }
+
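+ /** Explicitly end the current span at the given position, if one is open */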
+ public void endCurrentSpan(int pos) {
+ if (start > 0) {
+ spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
+ start = 0;
+ }
+ }
+
+ public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
+ if (start > 0) {
+ spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
+ start = 0;
+ }
+ return spans;
+ }
+ }
+}
diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java
index 021bbbb0..1b1e5571 100644
--- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java
+++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java
@@ -6,7 +6,7 @@ import nu.marginalia.keyword.extractors.TitleKeywords;
import nu.marginalia.keyword.extractors.UrlKeywords;
import nu.marginalia.model.idx.WordFlags;
-class KeywordMetadata {
+public class KeywordMetadata {
private final TitleKeywords titleKeywords;
private final NameLikeKeywords nameLikeKeywords;
diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java
index c1ade6b4..f2501930 100644
--- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java
+++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java
@@ -1,19 +1,40 @@
package nu.marginalia.keyword;
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.language.model.DocumentSentence;
import org.jetbrains.annotations.NotNull;
-import java.util.Iterator;
import java.util.List;
-public record LinkTexts(List<DocumentSentence> linkTexts) implements Iterable<DocumentSentence> {
+public record LinkTexts(
+ List<DocumentSentence> linkTexts,
+ TIntList counts
+) {
public LinkTexts() {
- this(List.of());
+ this(List.of(), new TIntArrayList());
+ }
+
+ public int length() {
+ return linkTexts.size();
}
@NotNull
- @Override
- public Iterator<DocumentSentence> iterator() {
- return linkTexts.iterator();
+ public LinkTexts.Iter iterator() {
+ return new Iter();
+ }
+
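+ /** Cursor-style iterator over the link texts and their associated counts;
+ * call next() before reading sentence() or count() */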
+ public class Iter {
+ private int pos = -1;
+
+ public boolean next() {
+ return ++pos < length();
+ }
+ public int count() {
+ return counts.get(pos);
+ }
+ public DocumentSentence sentence() {
+ return linkTexts.get(pos);
+ }
}
}
diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java
index 74a424ef..6d2a4df5 100644
--- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java
+++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java
@@ -17,7 +17,7 @@ import java.util.*;
public class DocumentKeywordsBuilder {
public final Object2ByteOpenHashMap<String> wordToMeta;
public final HashMap<String, IntList> wordToPos;
- public final Map<Character, List<DocumentWordSpan>> wordSpans = new HashMap<>();
+ public final Map<HtmlTag, List<DocumentWordSpan>> wordSpans = new HashMap<>();
/**
* These are keywords that have signals of high relevance
@@ -70,7 +70,7 @@ public class DocumentKeywordsBuilder {
positionsForTag.add(span.end());
}
- spans.add(new CodedWordSpan((byte) tag.charValue(), VarintCodedSequence.generate(positionsForTag)));
+ spans.add(new CodedWordSpan(tag.code, VarintCodedSequence.generate(positionsForTag)));
});
return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);
@@ -128,7 +128,7 @@ public class DocumentKeywordsBuilder {
public void addSpans(List<DocumentWordSpan> newSpans) {
for (var span : newSpans) {
- wordSpans.computeIfAbsent((char) span.tag().code, k -> new ArrayList<>()).add(span);
+ wordSpans.computeIfAbsent(span.tag(), k -> new ArrayList<>()).add(span);
}
}
diff --git a/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java
index 83996e41..5f25f8ed 100644
--- a/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java
+++ b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java
@@ -25,21 +25,6 @@ class DocumentKeywordExtractorTest {
static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor();
static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
- @Test
- public void testWordPattern() {
- Assertions.assertTrue(extractor.matchesWordPattern("test"));
- Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
- Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));
-
- Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test"));
- Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test"));
- Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24"));
- Assertions.assertTrue(extractor.matchesWordPattern("std::vector"));
- Assertions.assertTrue(extractor.matchesWordPattern("c++"));
- Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h"));
- Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
- }
-
@Test
public void testKeyboards2() throws IOException, URISyntaxException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
diff --git a/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java
new file mode 100644
index 00000000..a00dd3ae
--- /dev/null
+++ b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java
@@ -0,0 +1,184 @@
+package nu.marginalia.keyword;
+
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
+import it.unimi.dsi.fastutil.ints.IntList;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.DocumentSentence;
+import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.language.sentence.tag.HtmlTag;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class DocumentPositionMapperTest {
+ private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
+ static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
+
+ @Test
+ public void testWordPattern() {
+ Assertions.assertTrue(positionMapper.matchesWordPattern("test"));
+ Assertions.assertTrue(positionMapper.matchesWordPattern("1234567890abcde"));
+ Assertions.assertFalse(positionMapper.matchesWordPattern("1234567890abcdef"));
+
+ Assertions.assertTrue(positionMapper.matchesWordPattern("test-test-test-test-test"));
+ Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test"));
+ Assertions.assertTrue(positionMapper.matchesWordPattern("192.168.1.100/24"));
+ Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector"));
+ Assertions.assertTrue(positionMapper.matchesWordPattern("c++"));
+ Assertions.assertTrue(positionMapper.matchesWordPattern("m*a*s*h"));
+ Assertions.assertFalse(positionMapper.matchesWordPattern("Stulpnagelstrasse"));
+ }
+
+ @Test
+ public void testBasic() {
+ DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+ DocumentLanguageData dld = new DocumentLanguageData(
+ se.extractSentencesFromString("I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
+ "I am a teapot"
+ );
+
+ int pos = positionMapper.mapDocumentPositions(keywordsBuilder, Mockito.mock(KeywordMetadata.class), dld);
+
+ assertEquals(8, pos);
+ assertEquals(IntList.of(1), keywordsBuilder.wordToPos.get("i"));
+ assertEquals(IntList.of(2), keywordsBuilder.wordToPos.get("am"));
+ assertEquals(IntList.of(3), keywordsBuilder.wordToPos.get("a"));
+ assertEquals(IntList.of(4), keywordsBuilder.wordToPos.get("teapot"));
+ assertEquals(IntList.of(5), keywordsBuilder.wordToPos.get("short"));
+ assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("and"));
+ assertEquals(IntList.of(7), keywordsBuilder.wordToPos.get("stout"));
+
+ var codeSpans = keywordsBuilder.wordSpans.get(HtmlTag.CODE);
+ assertEquals(1, codeSpans.size());
+ var codeSpan = codeSpans.getFirst();
+
+ assertEquals(1, codeSpan.start());
+ assertEquals(8, codeSpan.end());
+ }
+
+
+ @Test
+ public void testLinksSingleWord1Rep() {
+ DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+ var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+ assertEquals(1, sentences.size());
+ TIntList counts = new TIntArrayList(new int[] { 1 });
+
+ positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+ new LinkTexts(sentences, counts));
+
+ assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("zelda"));
+
+ var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+ assertEquals(1, linkTextSpans.size());
+ var codeSpan = linkTextSpans.getFirst();
+
+ assertEquals(6, codeSpan.start());
+ assertEquals(7, codeSpan.end());
+ }
+
+ @Test
+ public void testLinksSingleWord2Reps() {
+ DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+ var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+ assertEquals(1, sentences.size());
+ TIntList counts = new TIntArrayList(new int[] { 4 }); // This will become 2 repetitions, formula is ~ sqrt(counts)
+
+ positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+ new LinkTexts(sentences, counts));
+
+ assertEquals(IntList.of(6, 9), keywordsBuilder.wordToPos.get("zelda"));
+
+ var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+ assertEquals(2, linkTextSpans.size());
+
+ DocumentKeywordsBuilder.DocumentWordSpan span;
+ span = linkTextSpans.get(0);
+
+ assertEquals(6, span.start());
+ assertEquals(7, span.end());
+
+ span = linkTextSpans.get(1);
+
+ assertEquals(9, span.start());
+ assertEquals(10, span.end());
+ }
+
+ @Test
+ public void testLinksTwoWords2Reps() {
+ DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+ var sentences = se.extractSentencesFromString("Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+ assertEquals(1, sentences.size());
+ TIntList counts = new TIntArrayList(new int[] { 4 });
+
+ positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+ new LinkTexts(sentences, counts));
+
+ assertEquals(IntList.of(6, 10), keywordsBuilder.wordToPos.get("zelda"));
+ assertEquals(IntList.of(7, 11), keywordsBuilder.wordToPos.get("ii"));
+
+ var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+ assertEquals(2, linkTextSpans.size());
+
+ DocumentKeywordsBuilder.DocumentWordSpan span;
+ span = linkTextSpans.get(0);
+
+ assertEquals(6, span.start());
+ assertEquals(8, span.end());
+
+ span = linkTextSpans.get(1);
+
+ assertEquals(10, span.start());
+ assertEquals(12, span.end());
+ }
+
+
+ @Test
+ public void testLinksTwoSent1Word1Rep() {
+ DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+ var sentences1 = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+ var sentences2 = se.extractSentencesFromString("Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+ assertEquals(1, sentences1.size());
+ assertEquals(1, sentences2.size());
+ TIntList counts = new TIntArrayList(new int[] { 1, 1 });
+
+ List<DocumentSentence> sentencesAll = new ArrayList<>();
+ sentencesAll.addAll(sentences1);
+ sentencesAll.addAll(sentences2);
+
+ positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+ new LinkTexts(sentencesAll, counts));
+
+ assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("zelda"));
+ assertEquals(IntList.of(9), keywordsBuilder.wordToPos.get("link"));
+
+ var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+ assertEquals(2, linkTextSpans.size());
+
+ DocumentKeywordsBuilder.DocumentWordSpan span;
+ span = linkTextSpans.get(0);
+
+ assertEquals(6, span.start());
+ assertEquals(7, span.end());
+
+ span = linkTextSpans.get(1);
+
+ assertEquals(9, span.start());
+ assertEquals(10, span.end());
+ }
+
+
+}
\ No newline at end of file
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java
index 2a4fbcb1..2eb073b9 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java
@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor;
import nu.marginalia.converting.model.ProcessedDocument;
+import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
@@ -14,6 +15,9 @@ public class DocumentDecorator {
public void addTerm(String term) {
extraSearchTerms.add(term);
}
+ public void addTerms(Collection<String> terms) {
+ extraSearchTerms.addAll(terms);
+ }
public void apply(ProcessedDocument doc) {
if (doc == null)
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java
index 36eae72a..d1e4d495 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java
@@ -15,6 +15,7 @@ import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
+import nu.marginalia.model.idx.WordFlags;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -118,6 +119,10 @@ public class DocumentProcessor {
ret.details = detailsWithWords.details();
ret.words = detailsWithWords.words();
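+ // Add a synthetic keyword marking the root document of the domain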
+ if (url.path.equals("/")) {
+ ret.words.addMeta("special:root", WordFlags.Synthetic.asBit());
+ }
+
documentDecorator.apply(ret);
if (Boolean.TRUE.equals(crawledDocument.hasCookies)
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java
index c0999c96..d31195f8 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java
@@ -66,6 +66,16 @@ public class DomainProcessor {
return fullProcessing(domain);
}
+ public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) {
+ try {
+ return new SideloadProcessing(dataStream, sizeHint, extraKeywords);
+ }
+ catch (Exception ex) {
+ logger.warn("Failed to process domain sideload", ex);
+ return null;
+ }
+ }
+
public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) {
try {
return new SideloadProcessing(dataStream, sizeHint);
@@ -74,7 +84,6 @@ public class DomainProcessor {
logger.warn("Failed to process domain sideload", ex);
return null;
}
-
}
public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {
@@ -89,6 +98,10 @@ public class DomainProcessor {
);
SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException {
+ this(dataStream, sizeHint, List.of());
+ }
+
+ SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) throws IOException {
this.dataStream = dataStream;
if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain crawledDomain))
@@ -100,6 +113,7 @@ public class DomainProcessor {
domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint;
documentDecorator = new DocumentDecorator();
+ documentDecorator.addTerms(extraKeywords);
processDomain(crawledDomain, domain, documentDecorator);
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java
index c3c9eac4..e6a87089 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java
@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.logic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.model.GeneratorType;
+import nu.marginalia.model.EdgeUrl;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
@@ -13,7 +14,12 @@ import java.util.List;
public class DocumentGeneratorExtractor {
private static final String defaultValue = "unset";
- public DocumentGenerator detectGenerator(Document doc, DocumentHeaders responseHeaders) {
+ public DocumentGenerator detectGenerator(EdgeUrl url, Document doc, DocumentHeaders responseHeaders) {
+
+ // Fextralife leaves no known tech fingerprint, but we know it's wiki software of some sort
+ if (url.domain.toString().endsWith(".wiki.fextralife.com")) {
+ return DocumentGenerator.of("wiki");
+ }
var tags = doc.select("meta[name=generator]");
@@ -57,6 +63,7 @@ public class DocumentGeneratorExtractor {
case "one.com":
case "wix.com":
case "wpbakery":
+ case "FluxGarden":
return DocumentGenerator.of(parts[0]);
case "adobe":
case "microsoft":
@@ -68,6 +75,7 @@ public class DocumentGeneratorExtractor {
}
}
+
if (parts.length > 1) {
return DocumentGenerator.of(parts[0], parts[0] + "_" + truncVersion(parts[1]));
}
@@ -183,7 +191,7 @@ public class DocumentGeneratorExtractor {
return DocumentGenerator.of("apache");
}
if (header.contains("server: cowboy")) {
- return DocumentGenerator.of("cowboy"); // erlang, really?!
+ return DocumentGenerator.of("cowboy"); // erlang, apparently
}
}
@@ -281,7 +289,7 @@ public class DocumentGeneratorExtractor {
-> GeneratorType.FORUM;
case "mediawiki", "dokuwiki", "wikidot", "sharepoint"
-> GeneratorType.WIKI;
- case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc"
+ case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc", "FluxGarden", "wiki"
-> GeneratorType.DOCS;
case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic", "osclass"
-> GeneratorType.ECOMMERCE_AND_SPAM;
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
index 09b4a360..e27d0f68 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
@@ -129,7 +129,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
final DocumentHeaders documentHeaders = new DocumentHeaders(crawledDocument.headers);
- final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, documentHeaders);
+ final var generatorParts = documentGeneratorExtractor.detectGenerator(url, doc, documentHeaders);
final var specialization = htmlProcessorSpecializations.select(generatorParts, url);
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java
index 8c6e92d2..f3c6227d 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java
@@ -65,8 +65,7 @@ public class SideloadSourceFactory {
public SideloadSource sideloadReddit(Path pathToDbFiles) throws IOException {
return sideload(pathToDbFiles,
new PathSuffixPredicate(".db"),
- (List<Path> paths) -> new RedditSideloader(paths,
- anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing));
+ (List<Path> paths) -> new RedditSideloader(paths, anchorTextKeywords, sideloaderProcessing));
}
public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException {
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java
index 61ccf09f..61fc9e32 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java
@@ -2,7 +2,6 @@ package nu.marginalia.converting.sideload.reddit;
import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.model.DomainLinks;
-import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
@@ -13,7 +12,7 @@ import nu.marginalia.integration.reddit.db.RedditDb;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
-import nu.marginalia.model.idx.WordFlags;
+import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.util.ProcessingIterator;
import org.apache.commons.lang3.StringUtils;
@@ -30,16 +29,13 @@ public class RedditSideloader implements SideloadSource {
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class);
private final List<Path> dbFiles;
- private final AnchorTagsSourceFactory anchorTagsSourceFactory;
private final AnchorTextKeywords anchorTextKeywords;
private final SideloaderProcessing sideloaderProcessing;
public RedditSideloader(List<Path> listToDbFiles,
- AnchorTagsSourceFactory anchorTagsSourceFactory,
AnchorTextKeywords anchorTextKeywords,
SideloaderProcessing sideloaderProcessing) {
this.dbFiles = listToDbFiles;
- this.anchorTagsSourceFactory = anchorTagsSourceFactory;
this.anchorTextKeywords = anchorTextKeywords;
this.sideloaderProcessing = sideloaderProcessing;
}
@@ -116,14 +112,25 @@ public class RedditSideloader implements SideloadSource {
.ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC)
.getYear();
- String fullHtml = "<!DOCTYPE html>\n<html>\n<head>\n <title>" + title + "</title>\n <script src=\"https://www.example.com/dummy.js\" type=\"text/javascript\"></script>\n</head>\n<body>\n <h1>" + title + "</h1>\n <article>\n <p>" + body + "</p>\n </article>\n</body>\n</html>\n";
+ String fullHtml = """
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <title>%s</title>
+ </head>
+ <body>
+ <article>
+ <h1>%s</h1>
+ <h2>reddit r/%s %s</h2>
+
+ <p>%s</p>
+ </article>
+ </body>
+ </html>
+ """.formatted(title, title, subreddit, subreddit, body);
List<String> extraKeywords = new ArrayList<>();
- extraKeywords.add("reddit");
- extraKeywords.add(subreddit);
- extraKeywords.add("r/" + subreddit);
-
if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) {
extraKeywords.add(author);
}
@@ -147,12 +154,18 @@ public class RedditSideloader implements SideloadSource {
if (doc.isProcessedFully()) {
- for (var keyword : extraKeywords) {
- doc.words.addMeta(keyword, WordFlags.Subjects.asBit());
+ // Insert topology information
+ if (doc.details != null) {
+ doc.details.metadata.withSizeAndTopology(50_000_000, score);
}
- // Insert topology information
- doc.details.metadata.withSizeAndTopology(50_000_000, score);
+ if (doc.words != null) {
+ doc.words.addAllSyntheticTerms(List.of("generator:forum",
+ HtmlFeature.COOKIES.getKeyword(),
+ HtmlFeature.JS.getKeyword(),
+ HtmlFeature.TRACKING_ADTECH.getKeyword()
+ ));
+ }
}
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java
index bf4d21f1..c42443b3 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java
@@ -24,10 +24,7 @@ import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import java.nio.file.Path;
-import java.util.Arrays;
-import java.util.EnumSet;
-import java.util.Iterator;
-import java.util.List;
+import java.util.*;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
@@ -36,6 +33,8 @@ public class StackexchangeSideloader implements SideloadSource {
private final DocumentKeywordExtractor keywordExtractor;
private final String domainName;
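+ // Features synthetically applied to all sideloaded documents, both as keywords and in the document flags below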
+ private final EnumSet<HtmlFeature> applyFeatures = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
+
private final Path dbFile;
public StackexchangeSideloader(Path pathToDbFile,
@@ -133,12 +132,17 @@ public class StackexchangeSideloader implements SideloadSource {
ret.url = url;
ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url);
- ret.words.addAllSyntheticTerms(List.of(
- "site:" + domainName,
- "site:" + url.domain.topDomain,
- url.domain.topDomain,
- domainName
- ));
+
+ List<String> syntheticTerms = new ArrayList<>(
+ List.of("site:" + domainName,
+ "site:" + url.domain.topDomain,
+ url.domain.topDomain,
+ domainName)
+ );
+ for (HtmlFeature feature : applyFeatures) {
+ syntheticTerms.add(feature.getKeyword());
+ }
+ ret.words.addAllSyntheticTerms(syntheticTerms);
if (!post.tags().isBlank()) {
List subjects = Arrays.asList(post.tags().split(","));
@@ -152,7 +156,7 @@ public class StackexchangeSideloader implements SideloadSource {
PubDate.toYearByte(ret.details.pubYear),
(int) -ret.details.quality,
EnumSet.of(DocumentFlags.GeneratorDocs));
- ret.details.features = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
+ ret.details.features = applyFeatures;
ret.details.metadata.withSizeAndTopology(10000, 0);
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java
index 785318d9..9dba2444 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java
@@ -12,6 +12,7 @@ import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.model.processed.SlopDomainLinkRecord;
import nu.marginalia.model.processed.SlopDomainRecord;
import nu.marginalia.sequence.VarintCodedSequence;
+import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -32,20 +33,26 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class);
public ConverterBatchWriter(Path basePath, int batchNumber) throws IOException {
- if (!Files.exists(ProcessedDataFileNames.domainFileName(basePath))) {
- Files.createDirectory(ProcessedDataFileNames.domainFileName(basePath));
- }
- domainWriter = new SlopDomainRecord.Writer(ProcessedDataFileNames.domainFileName(basePath), batchNumber);
+ Path domainPath = initSlopDir(ProcessedDataFileNames.domainFileName(basePath));
+ Path linksPath = initSlopDir(ProcessedDataFileNames.domainLinkFileName(basePath));
+ Path docsPath = initSlopDir(ProcessedDataFileNames.documentFileName(basePath));
- if (!Files.exists(ProcessedDataFileNames.domainLinkFileName(basePath))) {
- Files.createDirectory(ProcessedDataFileNames.domainLinkFileName(basePath));
- }
- domainLinkWriter = new SlopDomainLinkRecord.Writer(ProcessedDataFileNames.domainLinkFileName(basePath), batchNumber);
+ domainWriter = new SlopDomainRecord.Writer(domainPath, batchNumber);
+ domainLinkWriter = new SlopDomainLinkRecord.Writer(linksPath, batchNumber);
+ documentWriter = new SlopDocumentRecord.Writer(docsPath, batchNumber);
+ }
- if (!Files.exists(ProcessedDataFileNames.documentFileName(basePath))) {
- Files.createDirectory(ProcessedDataFileNames.documentFileName(basePath));
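+ /** Create a clean directory for a slop table, deleting any leftover data from a previous run */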
+ private Path initSlopDir(Path p) throws IOException {
+ if (Files.isDirectory(p)) {
+ FileUtils.deleteDirectory(p.toFile());
}
- documentWriter = new SlopDocumentRecord.Writer(ProcessedDataFileNames.documentFileName(basePath), batchNumber);
+ else if (Files.exists(p)) {
+ Files.delete(p);
+ }
+
+ Files.createDirectories(p);
+
+ return p;
}
/** Sets the lowest ordinal value for the documents in this batch */
@@ -114,7 +121,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
documentWriter.write(new SlopDocumentRecord(
domainName,
document.url.toString(),
- ordinal,
+ ordinal++,
document.state.toString(),
document.stateReason,
document.details.title,
@@ -132,17 +139,15 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
spanCodes,
spanSequences
));
-
- ordinal++;
}
}
- private Object writeLinkData(ProcessedDomain domain) throws IOException {
+ private void writeLinkData(ProcessedDomain domain) throws IOException {
String from = domain.domain.toString();
if (domain.documents == null)
- return this;
+ return;
Set seen = new HashSet<>();
@@ -171,10 +176,9 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
));
}
- return this;
}
- public Object writeDomainData(ProcessedDomain domain) throws IOException {
+ public void writeDomainData(ProcessedDomain domain) throws IOException {
DomainMetadata metadata = DomainMetadata.from(domain);
List<String> feeds = getFeedUrls(domain);
@@ -191,8 +195,6 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
feeds
)
);
-
- return this;
}
private List<String> getFeedUrls(ProcessedDomain domain) {
diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java
index 253fc673..1b162790 100644
--- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java
@@ -3,6 +3,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
+import nu.marginalia.model.EdgeUrl;
import nu.marginalia.test.CommonTestData;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
@@ -34,8 +35,8 @@ class JavadocSpecializationTest {
}
@Test
- void generatorExtraction() {
- var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders(""));
+ void generatorExtraction() throws Exception {
+ var gen = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(thread), new DocumentHeaders(""));
System.out.println(gen);
}
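All three specialization tests change the same way: detectGenerator() now takes the page URL as its first argument, and building an EdgeUrl from a String is a checked operation, which is why the test methods gain throws clauses. Roughly:

    // EdgeUrl's String constructor declares URISyntaxException,
    // so tests constructing one inline must declare or catch it.
    EdgeUrl url = new EdgeUrl("https://www.example.com/");
    var gen = generatorExtractor.detectGenerator(url, Jsoup.parse(html), new DocumentHeaders(""));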
diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java
index 178796df..77d3fc05 100644
--- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java
@@ -3,11 +3,13 @@ package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
+import nu.marginalia.model.EdgeUrl;
import nu.marginalia.test.CommonTestData;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
+import java.net.URISyntaxException;
import java.util.Set;
class LemmySpecializationTest {
@@ -37,9 +39,9 @@ class LemmySpecializationTest {
}
@Test
- void generatorExtraction() {
- var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), new DocumentHeaders(""));
- var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), new DocumentHeaders(""));
+ void generatorExtraction() throws URISyntaxException {
+ var generatorIndex = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(lemmyIndexHtml), new DocumentHeaders(""));
+ var generatorPost = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(lemmyPost), new DocumentHeaders(""));
System.out.println(generatorIndex);
System.out.println(generatorPost);
diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java
index 3efd2900..c4005c06 100644
--- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java
@@ -3,11 +3,13 @@ package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
+import nu.marginalia.model.EdgeUrl;
import nu.marginalia.test.CommonTestData;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
+import java.net.URISyntaxException;
import java.util.Set;
class XenForoSpecializationTest {
@@ -34,8 +36,8 @@ class XenForoSpecializationTest {
}
@Test
- void generatorExtraction() {
- var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders(""));
+ void generatorExtraction() throws URISyntaxException {
+ var gen = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(thread), new DocumentHeaders(""));
System.out.println(gen);
}
diff --git a/code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/LinkParser.java b/code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/LinkParser.java
index 8a04863d..717ae8a5 100644
--- a/code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/LinkParser.java
+++ b/code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/LinkParser.java
@@ -42,7 +42,8 @@ public class LinkParser {
.flatMap(this::createURI)
.map(URI::normalize)
.map(this::renormalize)
- .flatMap(this::createEdgeUrl);
+ .flatMap(this::createEdgeUrl)
+ .filter(url -> !hasBinarySuffix(url.path));
}
@Contract(pure=true)
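hasBinarySuffix() itself lies outside this hunk; for illustration, a plausible shape of the predicate the new .filter() stage relies on (this suffix list is invented, not the project's actual list):

    private static final List<String> BINARY_SUFFIXES =
            List.of(".png", ".jpg", ".gif", ".pdf", ".zip", ".exe");

    static boolean hasBinarySuffix(String path) {
        String lc = path.toLowerCase();
        for (String suffix : BINARY_SUFFIXES) {
            if (lc.endsWith(suffix))
                return true;
        }
        return false;
    }

Filtering at link-parse time keeps binary URLs out of the crawl frontier entirely, complementing the content-type checks this patch adds downstream.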
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
index b0b2c014..98133bcf 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
@@ -1,5 +1,6 @@
package nu.marginalia.crawl.retreival;
+import nu.marginalia.ContentTypes;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.lsh.EasyLSH;
import nu.marginalia.model.crawldata.CrawledDocument;
@@ -43,6 +44,9 @@ public class CrawlDataReference implements AutoCloseable {
try {
while (data.hasNext()) {
if (data.next() instanceof CrawledDocument doc) {
+ if (!ContentTypes.isAccepted(doc.contentType))
+ continue;
+
return doc;
}
}
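The reference stream backs recrawl comparisons against the previous crawl's data, which may predate content-type filtering, so non-document records are now skipped rather than compared. The loop above is equivalent to this standalone sketch (nextAccepted is a hypothetical helper, not part of the patch):

    // Advance past any record whose content type is not on the
    // ContentTypes whitelist before handing a document to the caller.
    static CrawledDocument nextAccepted(SerializableCrawlDataStream data) throws Exception {
        while (data.hasNext()) {
            if (data.next() instanceof CrawledDocument doc
                    && ContentTypes.isAccepted(doc.contentType)) {
                return doc;
            }
        }
        return null;
    }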
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index c6b426b3..ace2059b 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -317,26 +317,24 @@ public class CrawlerRetreiver implements AutoCloseable {
long probeStart = System.currentTimeMillis();
- /*
- probing is on probation for now while we evaluate how much the added delays slows down the crawler
-
if (probeType == HttpFetcher.ProbeType.FULL) {
+ retryLoop:
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
try {
var probeResult = fetcher.probeContentType(url, warcRecorder, contentTags);
- if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.Ok ok) {
- url = ok.resolvedUrl(); // If we were redirected while probing, use the final URL for fetching
- break;
- } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.BadContentType badContentType) {
- return new HttpFetchResult.ResultNone();
- } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout) {
- return new HttpFetchResult.ResultException(timeout.ex());
- } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.Exception exception) {
- return new HttpFetchResult.ResultException(exception.ex());
- }
- else { // should be unreachable
- throw new IllegalStateException("Unknown probe result");
+ switch (probeResult) {
+ case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
+ url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
+ break retryLoop;
+ case HttpFetcher.ContentTypeProbeResult.BadContentType badContentType:
+ return new HttpFetchResult.ResultNone();
+ case HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout:
+ return new HttpFetchResult.ResultException(timeout.ex());
+ case HttpFetcher.ContentTypeProbeResult.Exception exception:
+ return new HttpFetchResult.ResultException(exception.ex());
+ default: // should be unreachable
+ throw new IllegalStateException("Unknown probe result");
}
}
catch (HttpFetcherImpl.RateLimitException ex) {
@@ -348,8 +346,8 @@ public class CrawlerRetreiver implements AutoCloseable {
}
}
- timer.waitFetchDelay(System.currentTimeMillis() - probeStart);
- }*/
+ timer.waitFetchDelay(System.currentTimeMillis() - probeStart);
+ }
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
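Besides reinstating the probe, the rewrite replaces the instanceof chain with a pattern-matching switch. Two Java 21 features do the work: a record pattern destructures the Ok case straight into its resolvedUrl component, and break retryLoop exits the labeled retry loop rather than just the switch. A self-contained illustration with stand-in types (not the actual HttpFetcher API):

    sealed interface ProbeResult permits Ok, Bad {}
    record Ok(String resolvedUrl) implements ProbeResult {}
    record Bad(String reason) implements ProbeResult {}

    static String resolve(List<ProbeResult> attempts) {
        String url = "https://original.example/";
        retryLoop:
        for (ProbeResult attempt : attempts) {
            switch (attempt) {
                case Ok(String resolvedUrl):
                    url = resolvedUrl;  // bound by the record pattern
                    break retryLoop;    // leaves the loop, not just the switch
                case Bad bad:
                    continue;           // next retry attempt
            }
        }
        return url;
    }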
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java b/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java
new file mode 100644
index 00000000..dbc1989c
--- /dev/null
+++ b/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java
@@ -0,0 +1,22 @@
+package nu.marginalia;
+
+import java.util.Set;
+
+public class ContentTypes {
+ public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
+ "application/xhtml",
+ "text/html",
+ "image/x-icon",
+ "text/plain");
+
+ public static boolean isAccepted(String contentTypeHeader) {
+ String lcHeader = contentTypeHeader.toLowerCase();
+ for (var type : acceptedContentTypes) {
+ if (lcHeader.startsWith(type)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+}
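Since the check is a case-insensitive prefix match, Content-Type headers carrying parameters pass without any stripping:

    ContentTypes.isAccepted("text/html; charset=UTF-8");  // true: prefix match
    ContentTypes.isAccepted("TEXT/PLAIN");                // true: lowercased first
    ContentTypes.isAccepted("image/png");                 // false: filtered out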
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java
index f231c703..9474c2ff 100644
--- a/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java
@@ -1,6 +1,7 @@
package nu.marginalia.parquet.crawldata;
import blue.strategic.parquet.ParquetWriter;
+import nu.marginalia.ContentTypes;
import nu.marginalia.UserAgent;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.DocumentBodyResult;
@@ -62,6 +63,8 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
}
}
+
+
/** Return true if the WarcResponse should be excluded from conversion */
private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {
@@ -74,14 +77,25 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
return false;
}
- var robotsTags = response.http().headers().all("X-Robots-Tag");
+ var headers = response.http().headers();
+ var robotsTags = headers.all("X-Robots-Tag");
+
if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
return false;
}
+ // Strip out responses with content types we aren't interested in
+ // (though ideally we wouldn't download these at all)
+ String contentType = headers.first("Content-Type").orElse("text/plain").toLowerCase();
+
+ if (!ContentTypes.isAccepted(contentType)) {
+ return false;
+ }
+
return true;
}
+
private void write(String domain, WarcXEntityRefused refused) throws IOException {
URI profile = refused.profile();
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
index d6d407bf..b2a0f2bc 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
@@ -157,10 +157,10 @@ class WarcRecorderTest {
fileNameParquet);
var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
- assertEquals(3, urls.size());
+ assertEquals(2, urls.size());
assertEquals("https://www.marginalia.nu/", urls.get(0));
assertEquals("https://www.marginalia.nu/log/", urls.get(1));
- assertEquals("https://www.marginalia.nu/sanic.png", urls.get(2));
+ // sanic.png gets filtered out for its bad MIME type
}
diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java
index d05925bb..f8af9267 100644
--- a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java
+++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java
@@ -41,6 +41,7 @@ import java.time.temporal.ChronoUnit;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import static nu.marginalia.mqapi.ProcessInboxNames.LIVE_CRAWLER_INBOX;
@@ -196,7 +197,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
writer.setOrdinalOffset(67_000_000);
for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) {
- writer.write(domainProcessor.sideloadProcessing(stream, 0));
+ writer.write(domainProcessor.sideloadProcessing(stream, 0, Set.of("special:live")));
}
}
diff --git a/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java
index 537d6869..e7ceb519 100644
--- a/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java
+++ b/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java
@@ -75,7 +75,6 @@ public class DocumentLoaderService {
public void accept(SlopDocumentRecord.MetadataProjection projection)
{
-
long urlId = UrlIdCodec.encodeId(
domainIdRegistry.getDomainId(projection.domain()),
projection.ordinal()
@@ -88,7 +87,7 @@ public class DocumentLoaderService {
}
try {
- documentDbWriter.add(new DocdbUrlDetail(
+ details.add(new DocdbUrlDetail(
urlId,
parsedUrl.get(),
projection.title(),