Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-24 05:18:58 +00:00)

Merge branch 'master' into serp-redesign
Commit: eab61cd48a
@@ -69,7 +69,7 @@ public class ResultRankingParameters {
                 .bm25Params(new Bm25Parameters(1.2, 0.5))
                 .shortDocumentThreshold(2000)
                 .shortDocumentPenalty(2.)
-                .domainRankBonus(1 / 25.)
+                .domainRankBonus(1 / 100.)
                 .qualityPenalty(1 / 15.)
                 .shortSentenceThreshold(2)
                 .shortSentencePenalty(5)

@@ -123,13 +123,13 @@ public class DocumentSpan {

     /** Returns true if for any position in the list, there exists a range
      * (position[i], position[i]+len] that is overlapped by a span */
-    public boolean containsRangeExact(IntList positions, int len) {
+    public int containsRangeExact(IntList positions, int len) {
         if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) {
-            return false;
+            return 0;
         }

         int sei = 0;
+        int cnt = 0;

         int start = startsEnds.getInt(sei++);
         int end = startsEnds.getInt(sei++);

@@ -138,7 +138,15 @@ public class DocumentSpan {
             int position = positions.getInt(pi);

             if (position == start && position + len == end) {
-                return true;
+                cnt++;
+                if (sei + 2 <= startsEnds.size()) {
+                    pi = 0;
+                    start = startsEnds.getInt(sei++);
+                    end = startsEnds.getInt(sei++);
+                }
+                else {
+                    break;
+                }
             }
             else if (position < end) {
                 pi++;

@@ -147,11 +155,11 @@ public class DocumentSpan {
                 end = startsEnds.getInt(sei++);
             }
             else {
-                return false;
+                break;
             }
         }

-        return false;
+        return cnt;
     }

     public int countRangeMatches(IntList positions, int len) {

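With this change containsRangeExact no longer answers a yes/no question; it reports how many of the given positions start a range that exactly matches a recorded span. Call sites therefore switch from testing a flag to working with a count. A minimal sketch of the new calling convention, using the variable names that appear in the scoring code later in this commit (it is a fragment, not a self-contained program):

    // The method now returns the number of exact span matches instead of a boolean.
    int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
    if (cnt > 0) {
        // Callers that previously did `if (containsRangeExact(...)) score += 2;`
        // can now scale the bonus by the number of exact matches.
        score += 2 * cnt;
    }
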
@@ -115,16 +115,16 @@ class ForwardIndexSpansReaderTest {
         ) {
             var spans1 = reader.readSpans(arena, offset1);

-            assertFalse(spans1.heading.containsRangeExact(IntList.of(10), 2));
-            assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10), 2));
-            assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2));
+            assertEquals(0, spans1.heading.containsRangeExact(IntList.of(10), 2));
+            assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10), 2));
+            assertEquals(0, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2));

-            assertTrue(spans1.heading.containsRangeExact(IntList.of(10), 5));
-            assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10), 5));
-            assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5));
+            assertEquals(1, spans1.heading.containsRangeExact(IntList.of(10), 5));
+            assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10), 5));
+            assertEquals(1, spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5));

-            assertFalse(spans1.heading.containsRangeExact(IntList.of(11), 5));
-            assertFalse(spans1.heading.containsRangeExact(IntList.of(9), 5));
+            assertEquals(0, spans1.heading.containsRangeExact(IntList.of(11), 5));
+            assertEquals(0, spans1.heading.containsRangeExact(IntList.of(9), 5));
         }
     }

@@ -129,9 +129,9 @@ public class IndexResultScoreCalculator {
         double score = normalize(
                 score_firstPosition + score_proximity + score_verbatim
                         + score_bM25
-                        + score_bFlags
-                        + Math.max(0, documentBonus),
-                -Math.min(0, documentBonus));
+                        + score_bFlags,
+                -Math.min(0, documentBonus) // The magnitude of documentBonus, if it is negative; otherwise 0
+        );

         if (Double.isNaN(score)) { // This should never happen but if it does, we want to know about it
             if (getClass().desiredAssertionStatus()) {
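After this hunk a positive documentBonus no longer gets added into the numerator of normalize(); only the magnitude of a negative bonus is passed as the second argument, acting as a penalty. A short arithmetic illustration of what that second argument evaluates to (plain Java, not project code):

    // Illustration only: the second argument to normalize() is the magnitude of
    // documentBonus when it is negative, and zero otherwise.
    double documentBonus = -3.0;
    double penalty = -Math.min(0, documentBonus);   // 3.0: a negative bonus contributes its magnitude as a penalty

    documentBonus = 3.0;
    penalty = -Math.min(0, documentBonus);          // -0.0, i.e. effectively zero: a positive bonus no longer enters here
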
@@ -388,11 +388,13 @@ public class IndexResultScoreCalculator {
             }

             var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
-            if (extLinkSpan.length() == fullGroup.size
-                    && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size))
-            {
-                score += 2; // Add additional bonus if there's a single-word atag span
+            if (extLinkSpan.length() >= fullGroup.size) {
+                int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
+                if (cnt > 0) {
+                    score += 2 * cnt;
+                }
             }

             return;
         }

@@ -407,9 +409,9 @@ public class IndexResultScoreCalculator {

         // Bonus if there's a perfect match with an atag span
         var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
-        if (extLinkSpan.length() == fullGroup.size && extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size))
-        {
-            score += 2;
+        if (extLinkSpan.length() >= fullGroup.size) {
+            int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
+            score += 2*cnt;
         }

         // For optional groups, we scale the score by the size of the group relative to the full group

@@ -420,7 +422,7 @@ public class IndexResultScoreCalculator {
         IntList intersections = optionalGroup.findIntersections(positions);

         for (var tag : HtmlTag.includedTags) {
-            int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);;
+            int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);
             if (cnts > 0) {
                 score += (float) (weights_partial[tag.ordinal()] * optionalGroup.size * sizeScalingFactor * (1 + Math.log(2 + cnts)));
             }

@@ -452,12 +454,12 @@ public class IndexResultScoreCalculator {
         for (int i = 0; i < weights.length; i++) {
             weights[i] = switch(HtmlTag.includedTags[i]) {
                 case TITLE -> 2.5f;
-                case HEADING -> 2.5f;
+                case HEADING -> 1.25f;
                 case ANCHOR -> 0.2f;
                 case NAV -> 0.1f;
                 case CODE -> 0.25f;
                 case BODY -> 1.0f;
-                case EXTERNAL_LINKTEXT -> 0.75f;
+                case EXTERNAL_LINKTEXT -> 1.5f;
                 default -> 0.0f;
             };
         }

@@ -1,6 +1,8 @@
 package nu.marginalia.atags;

 import com.google.inject.Inject;
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.atags.model.Link;
 import nu.marginalia.keyword.LinkTexts;

@@ -51,6 +53,7 @@ public class AnchorTextKeywords {
         List<Link> keywordsRaw = links.forUrl(url);

         List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
+        TIntList counts = new TIntArrayList(keywordsRaw.size());

         // Extract and count keywords from anchor text
         for (Link keyword : keywordsRaw) {

@@ -59,18 +62,20 @@ public class AnchorTextKeywords {

             var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
             ret.add(sentence);
+            counts.add(keyword.count());
         }

-        return new LinkTexts(ret);
+        return new LinkTexts(ret, counts);
     }

     public LinkTexts getAnchorTextKeywords(DomainLinks links, List<EdgeUrl> urls) {
         List<Link> keywordsRaw = new ArrayList<>();
         for (var url : urls) {
-            links.forUrl(url);
+            keywordsRaw.addAll(links.forUrl(url));
         }

         List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
+        TIntList counts = new TIntArrayList(keywordsRaw.size());

         // Extract and count keywords from anchor text
         for (Link keyword : keywordsRaw) {

@@ -79,8 +84,9 @@ public class AnchorTextKeywords {

             var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
             ret.add(sentence);
+            counts.add(keyword.count());
         }

-        return new LinkTexts(ret);
+        return new LinkTexts(ret, counts);
     }
 }

@@ -41,7 +41,13 @@ public class DomainLinks {
     /** Returns the number of links to the given url. */
     public int countForUrl(EdgeUrl url) {
         String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param);
-        return links.getOrDefault(key, List.of()).size();
+
+        int cnt = 0;
+        for (var link : links.getOrDefault(key, List.of())) {
+            cnt += link.count();
+        }
+
+        return cnt;
     }

     @Override

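countForUrl now sums the per-link count() values rather than counting rows, so a link text that was observed many times weighs accordingly. A tiny worked example with made-up counts:

    // Hypothetical data: three anchor-text rows for the same URL, seen 5, 1 and 2 times.
    // Old behaviour: links.getOrDefault(key, List.of()).size()  -> 3 (number of rows)
    // New behaviour: summing link.count() over the same rows    -> 5 + 1 + 2 = 8
    int[] linkCounts = { 5, 1, 2 };
    int cnt = 0;
    for (int c : linkCounts) {
        cnt += c;
    }
    // cnt == 8
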
@@ -1,4 +1,4 @@
 package nu.marginalia.atags.model;

-public record Link(String source, String text) {
+public record Link(String text, int count) {
 }

@@ -1,7 +1,7 @@
 package nu.marginalia.atags.model;

-public record LinkWithText(String url, String text, String source) {
+public record LinkWithText(String url, String text, int cnt) {
     public Link toLink() {
-        return new Link(source, text);
+        return new Link(text, cnt);
     }
 }

@@ -80,7 +80,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
                 select
                     unnest(text) as 'text',
                     unnest(url) as 'url',
-                    unnest(source) as 'source'
+                    unnest(cnt) as 'cnt'
                 from atags
                 where dest = ?
                 """))

@@ -89,7 +89,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
             ps.setString(1, domain.toString());
             var rs = ps.executeQuery();
             while (rs.next()) {
-                links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source")));
+                links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getInt("cnt")));
             }

             // Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu?

@@ -102,7 +102,7 @@ public class AnchorTagsImpl implements AnchorTagsSource {
                 String url = rs.getString("url");
                 url = aliasDomain + url.substring(url.indexOf('/'));

-                links.add(new LinkWithText(url, rs.getString("text"), rs.getString("source")));
+                links.add(new LinkWithText(url, rs.getString("text"), rs.getInt("cnt")));
             }
             return new DomainLinks(links);
         }

@@ -5,35 +5,29 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.keyword.extractors.*;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.model.DocumentLanguageData;
-import nu.marginalia.language.model.DocumentSentence;
-import nu.marginalia.language.model.WordRep;
-import nu.marginalia.language.sentence.tag.HtmlTag;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;

-import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Comparator;
-import java.util.List;
 import java.util.stream.Stream;

 public class DocumentKeywordExtractor {

-    private final KeywordExtractor keywordExtractor;
     private final TermFrequencyDict dict;

+    private final KeywordExtractor keywordExtractor = new KeywordExtractor();
+    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();

     @Inject
     public DocumentKeywordExtractor(TermFrequencyDict dict) {
         this.dict = dict;
-        this.keywordExtractor = new KeywordExtractor();
     }

     // for tests
     public DocumentKeywordExtractor() {
         try {
             this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels());
-            this.keywordExtractor = new KeywordExtractor();
         }
         catch (Exception ex) {
             throw new RuntimeException(ex);

@@ -60,7 +54,7 @@ public class DocumentKeywordExtractor {

         DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();

-        createSimpleWords(wordsBuilder, keywordMetadata, dld, linkTexts);
+        positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);

         createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
         createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);

@@ -106,176 +100,4 @@ public class DocumentKeywordExtractor {
         }
     }

-    private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
-                                   KeywordMetadata metadata,
-                                   DocumentLanguageData dld,
-                                   LinkTexts linkTexts)
-    {
-        // we use 1-based indexing since the data
-        // will be gamma encoded, and it can't represent 0
-        int pos = 0;
-
-        List<SpanRecorder> spanRecorders = new ArrayList<>();
-        for (var htmlTag : HtmlTag.includedTags) {
-            if (!htmlTag.exclude) {
-                spanRecorders.add(new SpanRecorder(htmlTag));
-            }
-        }
-
-        for (DocumentSentence sent : dld) {
-            for (var word : sent) {
-                pos++;
-
-                for (var recorder : spanRecorders) {
-                    recorder.update(sent, pos);
-                }
-
-                if (word.isStopWord()) {
-                    continue;
-                }
-
-                String w = word.wordLowerCase();
-                if (matchesWordPattern(w)) {
-                    /* Add information about term positions */
-                    wordsBuilder.addPos(w, pos);
-
-                    /* Add metadata for word */
-                    wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
-                }
-            }
-
-            for (var names : keywordExtractor.getProperNames(sent)) {
-                var rep = new WordRep(sent, names);
-
-                byte meta = metadata.getMetadataForWord(rep.stemmed);
-
-                wordsBuilder.addMeta(rep.word, meta);
-            }
-        }
-
-        pos++; // we need to add one more position to account for the last word in the document
-
-        for (var recorder : spanRecorders) {
-            wordsBuilder.addSpans(recorder.finish(pos));
-
-            // reset the recorder, so we can use it again without adding the same positions twice
-            recorder.reset();
-        }
-
-        // Next add synthetic positions to the document for anchor texts
-
-        pos += 2; // add some padding to the end of the document before we start adding a-tag words
-
-        for (var linkText : linkTexts) {
-
-            for (var word : linkText) {
-                pos++;
-
-                for (var recorder : spanRecorders) {
-                    recorder.update(linkText, pos);
-                }
-
-                if (word.isStopWord()) {
-                    continue;
-                }
-
-                String w = word.wordLowerCase();
-                if (matchesWordPattern(w)) {
-                    /* Add information about term positions */
-                    wordsBuilder.addPos(w, pos);
-
-                    /* Add metadata for word */
-                    wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
-                }
-            }
-
-            // add some padding between separate link texts so we don't match across their boundaries
-            pos+=2;
-        }
-
-        for (var recorder : spanRecorders) {
-            wordsBuilder.addSpans(recorder.finish(pos));
-        }
-    }
-
-    boolean matchesWordPattern(String s) {
-        // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
-
-        String wordPartSeparator = ".-_/:+*";
-
-        int i = 0;
-
-        for (int run = 0; run < 15 && i < s.length(); run++, i++) {
-            char c = s.charAt(i);
-            if (c >= 'a' && c <= 'z') continue;
-            if (c >= 'A' && c <= 'Z') continue;
-            if (c >= '0' && c <= '9') continue;
-            break;
-        }
-
-        if (i == 0)
-            return false;
-
-        for (int j = 0; j < 5; j++) {
-            if (i == s.length()) return true;
-
-            if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
-                return false;
-            }
-
-            i++;
-
-            for (int run = 0; run < 10 && i < s.length(); run++, i++) {
-                char c = s.charAt(i);
-                if (c >= 'a' && c <= 'z') continue;
-                if (c >= 'A' && c <= 'Z') continue;
-                if (c >= '0' && c <= '9') continue;
-                break;
-            }
-        }
-
-        return false;
-    }
-
-    /** Helper class to record spans of words */
-    private static class SpanRecorder {
-        private List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
-        private final HtmlTag htmlTag;
-        private int start = 0;
-
-        public SpanRecorder(HtmlTag htmlTag) {
-            this.htmlTag = htmlTag;
-        }
-
-        public void update(DocumentSentence sentence, int pos) {
-            assert pos > 0;
-
-            if (
-                    sentence.htmlTags.contains(htmlTag)
-                            || (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY) // special case for body tag, we match against no tag on the sentence
-            )
-            {
-                if (start <= 0) start = pos;
-            }
-            else {
-                if (start > 0) {
-                    spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
-                    start = 0;
-                }
-            }
-        }
-
-        public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
-            if (start > 0) {
-                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
-                start = 0;
-            }
-            return spans;
-        }
-
-        public void reset() {
-            spans.clear();
-            start = 0;
-        }
-    }
 }

@@ -0,0 +1,237 @@
+package nu.marginalia.keyword;
+
+import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.DocumentSentence;
+import nu.marginalia.language.model.WordRep;
+import nu.marginalia.language.sentence.tag.HtmlTag;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static java.lang.Math.min;
+import static java.lang.Math.sqrt;
+
+/** DocumentPositionMapper is responsible for assigning keywords positions in the document,
+ * as well as recording spans of positions
+ */
+public class DocumentPositionMapper {
+
+    private final KeywordExtractor keywordExtractor = new KeywordExtractor();
+
+    public void mapPositionsAndExtractSimpleKeywords(DocumentKeywordsBuilder wordsBuilder,
+                                                     KeywordMetadata metadata,
+                                                     DocumentLanguageData dld,
+                                                     LinkTexts linkTexts)
+    {
+        // First map the words in the documnent to their positions
+        int pos = mapDocumentPositions(wordsBuilder, metadata, dld);
+
+        // Next create some padding space to avoid cross-matching
+        pos += 2;
+
+        // Finally allocate some virtual space after the end of the document
+        // for the link texts, so that we can match against them as well, although
+        // these will be given a different span type.
+        mapLinkTextPositions(pos, wordsBuilder, metadata, linkTexts);
+    }
+
+
+    int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
+                             KeywordMetadata metadata,
+                             DocumentLanguageData dld)
+
+    {
+
+        List<SpanRecorder> spanRecorders = new ArrayList<>();
+        for (var htmlTag : HtmlTag.includedTags) {
+            if (!htmlTag.exclude) {
+                spanRecorders.add(new SpanRecorder(htmlTag));
+            }
+        }
+
+        // we use 1-based indexing since the data
+        // will be gamma encoded, and it can't represent 0;
+        // but the loop starts by incrementing the position,
+        // so while unintuitive, zero is correct here.
+        int pos = 0;
+
+        for (DocumentSentence sent : dld) {
+            for (var word : sent) {
+                pos++;
+
+                // Update span position tracking
+                for (var recorder : spanRecorders) {
+                    recorder.update(sent, pos);
+                }
+
+                if (word.isStopWord()) {
+                    continue;
+                }
+
+                String w = word.wordLowerCase();
+                if (matchesWordPattern(w)) {
+                    /* Add information about term positions */
+                    wordsBuilder.addPos(w, pos);
+
+                    /* Add metadata for word */
+                    wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
+                }
+            }
+
+            for (var names : keywordExtractor.getProperNames(sent)) {
+                WordRep rep = new WordRep(sent, names);
+                byte meta = metadata.getMetadataForWord(rep.stemmed);
+
+                wordsBuilder.addMeta(rep.word, meta);
+            }
+        }
+
+        pos++; // we need to add one more position to account for the last word in the document
+
+        for (var recorder : spanRecorders) {
+            wordsBuilder.addSpans(recorder.finish(pos));
+        }
+
+        return pos;
+    }
+
+    void mapLinkTextPositions(int startPos,
+                              DocumentKeywordsBuilder wordsBuilder,
+                              KeywordMetadata metadata,
+                              LinkTexts linkTexts)
+    {
+        int pos = startPos;
+
+        SpanRecorder extLinkRecorder = new SpanRecorder(HtmlTag.EXTERNAL_LINKTEXT);
+
+        LinkTexts.Iter iter = linkTexts.iterator();
+
+        while (iter.next()) {
+
+            DocumentSentence sentence = iter.sentence();
+            int count = iter.count();
+
+            // We repeat a link sentence a number of times that is a function of how many times it's been spotted
+            // as a link text. A really "big" link typically has hundreds, if not thousands of repetitions, so we
+            // attenuate that a bit with math so we don't generate a needlessly large positions list
+
+            final int repetitions = (int) Math.max(1, min(sqrt(count), 12));
+
+            for (int ci = 0; ci < repetitions; ci++) {
+
+                for (var word : sentence) {
+                    pos++;
+
+                    extLinkRecorder.update(sentence, pos);
+
+                    if (word.isStopWord()) {
+                        continue;
+                    }
+
+                    String w = word.wordLowerCase();
+                    if (matchesWordPattern(w)) {
+                        /* Add information about term positions */
+                        wordsBuilder.addPos(w, pos);
+
+                        /* Add metadata for word */
+                        wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
+                    }
+                }
+
+                // Add a break between sentences, to prevent them being registered as one long run-on sentence
+                extLinkRecorder.endCurrentSpan(pos + 1);
+
+                // Also add some positional padding between separate link texts so we don't match across their boundaries
+                pos += 2;
+            }
+        }
+
+        wordsBuilder.addSpans(extLinkRecorder.finish(pos));
+    }
+
+    boolean matchesWordPattern(String s) {
+        // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
+
+        String wordPartSeparator = ".-_/:+*";
+
+        int i = 0;
+
+        for (int run = 0; run < 15 && i < s.length(); run++, i++) {
+            char c = s.charAt(i);
+            if (c >= 'a' && c <= 'z') continue;
+            if (c >= 'A' && c <= 'Z') continue;
+            if (c >= '0' && c <= '9') continue;
+            break;
+        }
+
+        if (i == 0)
+            return false;
+
+        for (int j = 0; j < 5; j++) {
+            if (i == s.length()) return true;
+
+            if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
+                return false;
+            }
+
+            i++;
+
+            for (int run = 0; run < 10 && i < s.length(); run++, i++) {
+                char c = s.charAt(i);
+                if (c >= 'a' && c <= 'z') continue;
+                if (c >= 'A' && c <= 'Z') continue;
+                if (c >= '0' && c <= '9') continue;
+                break;
+            }
+        }
+
+        return false;
+    }
+
+    /** Helper class to record spans of words */
+    private static class SpanRecorder {
+        private final List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
+        private final HtmlTag htmlTag;
+        private int start = 0;
+
+        public SpanRecorder(HtmlTag htmlTag) {
+            this.htmlTag = htmlTag;
+        }
+
+        public void update(DocumentSentence sentence, int pos) {
+            assert pos > 0;
+
+            if (sentence.htmlTags.contains(htmlTag)) {
+                if (start <= 0) start = pos;
+            }
+            else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY)
+            {
+                // special case for body tag, we match against no tag on the sentence
+                if (start <= 0) start = pos;
+            }
+            else {
+                if (start > 0) {
+                    spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
+                    start = 0;
+                }
+            }
+        }
+
+        public void endCurrentSpan(int pos) {
+            if (start > 0) {
+                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
+                start = 0;
+            }
+        }
+
+        public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
+            if (start > 0) {
+                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
+                start = 0;
+            }
+            return spans;
+        }
+    }
+}

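The comment in mapLinkTextPositions explains why frequently seen link texts are attenuated; the concrete mapping from a link's observed count to the number of repeated insertions is max(1, min(sqrt(count), 12)). A few worked values (plain arithmetic, not project code):

    // Worked examples of the repetition formula used in mapLinkTextPositions():
    //   repetitions = (int) Math.max(1, Math.min(Math.sqrt(count), 12))
    int r1    = (int) Math.max(1, Math.min(Math.sqrt(1), 12));    // count = 1    -> 1 repetition
    int r4    = (int) Math.max(1, Math.min(Math.sqrt(4), 12));    // count = 4    -> 2 repetitions
    int r100  = (int) Math.max(1, Math.min(Math.sqrt(100), 12));  // count = 100  -> 10 repetitions
    int r5000 = (int) Math.max(1, Math.min(Math.sqrt(5000), 12)); // count = 5000 -> capped at 12
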
@@ -6,7 +6,7 @@ import nu.marginalia.keyword.extractors.TitleKeywords;
 import nu.marginalia.keyword.extractors.UrlKeywords;
 import nu.marginalia.model.idx.WordFlags;

-class KeywordMetadata {
+public class KeywordMetadata {

     private final TitleKeywords titleKeywords;
     private final NameLikeKeywords nameLikeKeywords;

@@ -1,19 +1,40 @@
 package nu.marginalia.keyword;

+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
 import nu.marginalia.language.model.DocumentSentence;
 import org.jetbrains.annotations.NotNull;

-import java.util.Iterator;
 import java.util.List;

-public record LinkTexts(List<DocumentSentence> linkTexts) implements Iterable<DocumentSentence> {
+public record LinkTexts(
+        List<DocumentSentence> linkTexts,
+        TIntList counts
+) {
     public LinkTexts() {
-        this(List.of());
+        this(List.of(), new TIntArrayList());
+    }
+
+    public int length() {
+        return linkTexts.size();
     }

     @NotNull
-    @Override
-    public Iterator<DocumentSentence> iterator() {
-        return linkTexts.iterator();
+    public LinkTexts.Iter iterator() {
+        return new Iter();
+    }
+
+    public class Iter {
+        private int pos = -1;
+
+        public boolean next() {
+            return ++pos < length();
+        }
+        public int count() {
+            return counts.get(pos);
+        }
+        public DocumentSentence sentence() {
+            return linkTexts.get(pos);
+        }
     }
 }

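LinkTexts no longer implements Iterable; consumers move to the cursor-style Iter, which exposes each sentence together with its observed count. A minimal sketch of the new iteration pattern, matching how DocumentPositionMapper consumes the record earlier in this commit (the linkTexts variable is assumed to be a populated LinkTexts instance):

    // Cursor-style iteration over link texts and their counts.
    LinkTexts.Iter iter = linkTexts.iterator();
    while (iter.next()) {
        DocumentSentence sentence = iter.sentence(); // the extracted link-text sentence
        int count = iter.count();                    // how many times this link text was observed
        // ... use sentence and count together
    }
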
@@ -17,7 +17,7 @@ import java.util.*;
 public class DocumentKeywordsBuilder {
     public final Object2ByteOpenHashMap<String> wordToMeta;
     public final HashMap<String, IntList> wordToPos;
-    public final Map<Character, List<DocumentWordSpan>> wordSpans = new HashMap<>();
+    public final Map<HtmlTag, List<DocumentWordSpan>> wordSpans = new HashMap<>();

     /**
      * These ware keywords that had signals of high relevance

@@ -70,7 +70,7 @@ public class DocumentKeywordsBuilder {
                 positionsForTag.add(span.end());
             }

-            spans.add(new CodedWordSpan((byte) tag.charValue(), VarintCodedSequence.generate(positionsForTag)));
+            spans.add(new CodedWordSpan(tag.code, VarintCodedSequence.generate(positionsForTag)));
         });

         return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);

@@ -128,7 +128,7 @@ public class DocumentKeywordsBuilder {

     public void addSpans(List<DocumentWordSpan> newSpans) {
         for (var span : newSpans) {
-            wordSpans.computeIfAbsent((char) span.tag().code, k -> new ArrayList<>()).add(span);
+            wordSpans.computeIfAbsent(span.tag(), k -> new ArrayList<>()).add(span);
         }
     }

@@ -25,21 +25,6 @@ class DocumentKeywordExtractorTest {
     static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor();
     static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

-    @Test
-    public void testWordPattern() {
-        Assertions.assertTrue(extractor.matchesWordPattern("test"));
-        Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
-        Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));
-
-        Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test"));
-        Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test"));
-        Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24"));
-        Assertions.assertTrue(extractor.matchesWordPattern("std::vector"));
-        Assertions.assertTrue(extractor.matchesWordPattern("c++"));
-        Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h"));
-        Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
-    }
-
     @Test
     public void testKeyboards2() throws IOException, URISyntaxException {
         var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),

@@ -0,0 +1,184 @@
+package nu.marginalia.keyword;
+
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
+import it.unimi.dsi.fastutil.ints.IntList;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.DocumentSentence;
+import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.language.sentence.tag.HtmlTag;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class DocumentPositionMapperTest {
+    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
+    static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
+
+    @Test
+    public void testWordPattern() {
+        Assertions.assertTrue(positionMapper.matchesWordPattern("test"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("1234567890abcde"));
+        Assertions.assertFalse(positionMapper.matchesWordPattern("1234567890abcdef"));
+
+        Assertions.assertTrue(positionMapper.matchesWordPattern("test-test-test-test-test"));
+        Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("192.168.1.100/24"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("c++"));
+        Assertions.assertTrue(positionMapper.matchesWordPattern("m*a*s*h"));
+        Assertions.assertFalse(positionMapper.matchesWordPattern("Stulpnagelstrasse"));
+    }
+
+    @Test
+    public void testBasic() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+        DocumentLanguageData dld = new DocumentLanguageData(
+                se.extractSentencesFromString("I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
+                "I am a teapot"
+        );
+
+        int pos = positionMapper.mapDocumentPositions(keywordsBuilder, Mockito.mock(KeywordMetadata.class), dld);
+
+        assertEquals(8, pos);
+        assertEquals(IntList.of(1), keywordsBuilder.wordToPos.get("i"));
+        assertEquals(IntList.of(2), keywordsBuilder.wordToPos.get("am"));
+        assertEquals(IntList.of(3), keywordsBuilder.wordToPos.get("a"));
+        assertEquals(IntList.of(4), keywordsBuilder.wordToPos.get("teapot"));
+        assertEquals(IntList.of(5), keywordsBuilder.wordToPos.get("short"));
+        assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("and"));
+        assertEquals(IntList.of(7), keywordsBuilder.wordToPos.get("stout"));
+
+        var codeSpans = keywordsBuilder.wordSpans.get(HtmlTag.CODE);
+        assertEquals(1, codeSpans.size());
+        var codeSpan = codeSpans.getFirst();
+
+        assertEquals(1, codeSpan.start());
+        assertEquals(8, codeSpan.end());
+    }
+
+
+    @Test
+    public void testLinksSingleWord1Rep() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences.size());
+        TIntList counts = new TIntArrayList(new int[] { 1 });
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentences, counts));
+
+        assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("zelda"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(1, linkTextSpans.size());
+        var codeSpan = linkTextSpans.getFirst();
+
+        assertEquals(6, codeSpan.start());
+        assertEquals(7, codeSpan.end());
+    }
+
+    @Test
+    public void testLinksSingleWord2Reps() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences.size());
+        TIntList counts = new TIntArrayList(new int[] { 4 }); // This will become 2 repetitions, formula is ~ sqrt(counts)
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentences, counts));
+
+        assertEquals(IntList.of(6, 9), keywordsBuilder.wordToPos.get("zelda"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(2, linkTextSpans.size());
+
+        DocumentKeywordsBuilder.DocumentWordSpan span;
+        span = linkTextSpans.get(0);
+
+        assertEquals(6, span.start());
+        assertEquals(7, span.end());
+
+        span = linkTextSpans.get(1);
+
+        assertEquals(9, span.start());
+        assertEquals(10, span.end());
+    }
+
+    @Test
+    public void testLinksTwoWords2Reps() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences = se.extractSentencesFromString("Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences.size());
+        TIntList counts = new TIntArrayList(new int[] { 4 });
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentences, counts));
+
+        assertEquals(IntList.of(6, 10), keywordsBuilder.wordToPos.get("zelda"));
+        assertEquals(IntList.of(7, 11), keywordsBuilder.wordToPos.get("ii"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(2, linkTextSpans.size());
+
+        DocumentKeywordsBuilder.DocumentWordSpan span;
+        span = linkTextSpans.get(0);
+
+        assertEquals(6, span.start());
+        assertEquals(8, span.end());
+
+        span = linkTextSpans.get(1);
+
+        assertEquals(10, span.start());
+        assertEquals(12, span.end());
+    }
+
+
+    @Test
+    public void testLinksTwoSent1Word1Rep() {
+        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
+
+        var sentences1 = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        var sentences2 = se.extractSentencesFromString("Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+        assertEquals(1, sentences1.size());
+        assertEquals(1, sentences2.size());
+        TIntList counts = new TIntArrayList(new int[] { 1, 1 });
+
+        List<DocumentSentence> sentencesAll = new ArrayList<>();
+        sentencesAll.addAll(sentences1);
+        sentencesAll.addAll(sentences2);
+
+        positionMapper.mapLinkTextPositions(5, keywordsBuilder, Mockito.mock(KeywordMetadata.class),
+                new LinkTexts(sentencesAll, counts));
+
+        assertEquals(IntList.of(6), keywordsBuilder.wordToPos.get("zelda"));
+        assertEquals(IntList.of(9), keywordsBuilder.wordToPos.get("link"));
+
+        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
+        assertEquals(2, linkTextSpans.size());
+
+        DocumentKeywordsBuilder.DocumentWordSpan span;
+        span = linkTextSpans.get(0);
+
+        assertEquals(6, span.start());
+        assertEquals(7, span.end());
+
+        span = linkTextSpans.get(1);
+
+        assertEquals(9, span.start());
+        assertEquals(10, span.end());
+    }
+
+
+}

@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor;

 import nu.marginalia.converting.model.ProcessedDocument;

+import java.util.Collection;
 import java.util.HashSet;
 import java.util.Set;

@@ -14,6 +15,9 @@ public class DocumentDecorator {
     public void addTerm(String term) {
         extraSearchTerms.add(term);
     }
+    public void addTerms(Collection<String> terms) {
+        extraSearchTerms.addAll(terms);
+    }

     public void apply(ProcessedDocument doc) {
         if (doc == null)

@@ -15,6 +15,7 @@ import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
+import nu.marginalia.model.idx.WordFlags;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -118,6 +119,10 @@ public class DocumentProcessor {
         ret.details = detailsWithWords.details();
         ret.words = detailsWithWords.words();

+        if (url.path.equals("/")) {
+            ret.words.addMeta("special:root", WordFlags.Synthetic.asBit());
+        }
+
         documentDecorator.apply(ret);

         if (Boolean.TRUE.equals(crawledDocument.hasCookies)

@@ -66,6 +66,16 @@ public class DomainProcessor {
         return fullProcessing(domain);
     }

+    public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) {
+        try {
+            return new SideloadProcessing(dataStream, sizeHint, extraKeywords);
+        }
+        catch (Exception ex) {
+            logger.warn("Failed to process domain sideload", ex);
+            return null;
+        }
+    }
+
     public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) {
         try {
             return new SideloadProcessing(dataStream, sizeHint);

@@ -74,7 +84,6 @@ public class DomainProcessor {
             logger.warn("Failed to process domain sideload", ex);
             return null;
         }
-
     }

     public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {

@@ -89,6 +98,10 @@ public class DomainProcessor {
         );

         SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException {
+            this(dataStream, sizeHint, List.of());
+        }
+
+        SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) throws IOException {
             this.dataStream = dataStream;

             if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain crawledDomain))

@@ -100,6 +113,7 @@ public class DomainProcessor {
             domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint;

             documentDecorator = new DocumentDecorator();
+            documentDecorator.addTerms(extraKeywords);

             processDomain(crawledDomain, domain, documentDecorator);

|
|||||||
|
|
||||||
import nu.marginalia.converting.model.DocumentHeaders;
|
import nu.marginalia.converting.model.DocumentHeaders;
|
||||||
import nu.marginalia.converting.model.GeneratorType;
|
import nu.marginalia.converting.model.GeneratorType;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
@ -13,7 +14,12 @@ import java.util.List;
|
|||||||
public class DocumentGeneratorExtractor {
|
public class DocumentGeneratorExtractor {
|
||||||
private static final String defaultValue = "unset";
|
private static final String defaultValue = "unset";
|
||||||
|
|
||||||
public DocumentGenerator detectGenerator(Document doc, DocumentHeaders responseHeaders) {
|
public DocumentGenerator detectGenerator(EdgeUrl url, Document doc, DocumentHeaders responseHeaders) {
|
||||||
|
|
||||||
|
// Fextralife leaves no known tech fingerprint, but we know it's a wiki software of some sort
|
||||||
|
if (url.domain.toString().endsWith(".wiki.fextralife.com")) {
|
||||||
|
return DocumentGenerator.of("wiki");
|
||||||
|
}
|
||||||
|
|
||||||
var tags = doc.select("meta[name=generator]");
|
var tags = doc.select("meta[name=generator]");
|
||||||
|
|
||||||
@ -57,6 +63,7 @@ public class DocumentGeneratorExtractor {
|
|||||||
case "one.com":
|
case "one.com":
|
||||||
case "wix.com":
|
case "wix.com":
|
||||||
case "wpbakery":
|
case "wpbakery":
|
||||||
|
case "FluxGarden":
|
||||||
return DocumentGenerator.of(parts[0]);
|
return DocumentGenerator.of(parts[0]);
|
||||||
case "adobe":
|
case "adobe":
|
||||||
case "microsoft":
|
case "microsoft":
|
||||||
@ -68,6 +75,7 @@ public class DocumentGeneratorExtractor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (parts.length > 1) {
|
if (parts.length > 1) {
|
||||||
return DocumentGenerator.of(parts[0], parts[0] + "_" + truncVersion(parts[1]));
|
return DocumentGenerator.of(parts[0], parts[0] + "_" + truncVersion(parts[1]));
|
||||||
}
|
}
|
||||||
@ -183,7 +191,7 @@ public class DocumentGeneratorExtractor {
|
|||||||
return DocumentGenerator.of("apache");
|
return DocumentGenerator.of("apache");
|
||||||
}
|
}
|
||||||
if (header.contains("server: cowboy")) {
|
if (header.contains("server: cowboy")) {
|
||||||
return DocumentGenerator.of("cowboy"); // erlang, really?!
|
return DocumentGenerator.of("cowboy"); // erlang, apparently
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -281,7 +289,7 @@ public class DocumentGeneratorExtractor {
|
|||||||
-> GeneratorType.FORUM;
|
-> GeneratorType.FORUM;
|
||||||
case "mediawiki", "dokuwiki", "wikidot", "sharepoint"
|
case "mediawiki", "dokuwiki", "wikidot", "sharepoint"
|
||||||
-> GeneratorType.WIKI;
|
-> GeneratorType.WIKI;
|
||||||
case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc"
|
case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc", "FluxGarden", "wiki"
|
||||||
-> GeneratorType.DOCS;
|
-> GeneratorType.DOCS;
|
||||||
case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic", "osclass"
|
case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic", "osclass"
|
||||||
-> GeneratorType.ECOMMERCE_AND_SPAM;
|
-> GeneratorType.ECOMMERCE_AND_SPAM;
|
||||||
|
@@ -129,7 +129,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         final EdgeUrl url = new EdgeUrl(crawledDocument.url);
         final DocumentHeaders documentHeaders = new DocumentHeaders(crawledDocument.headers);

-        final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, documentHeaders);
+        final var generatorParts = documentGeneratorExtractor.detectGenerator(url, doc, documentHeaders);

         final var specialization = htmlProcessorSpecializations.select(generatorParts, url);

@@ -65,8 +65,7 @@ public class SideloadSourceFactory {
     public SideloadSource sideloadReddit(Path pathToDbFiles) throws IOException {
         return sideload(pathToDbFiles,
                 new PathSuffixPredicate(".db"),
-                (List<Path> paths) -> new RedditSideloader(paths,
-                        anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing));
+                (List<Path> paths) -> new RedditSideloader(paths, anchorTextKeywords, sideloaderProcessing));
     }

     public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException {

@@ -2,7 +2,6 @@ package nu.marginalia.converting.sideload.reddit

 import nu.marginalia.atags.AnchorTextKeywords;
 import nu.marginalia.atags.model.DomainLinks;
-import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.converting.model.GeneratorType;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.model.ProcessedDomain;
@@ -13,7 +12,7 @@ import nu.marginalia.integration.reddit.db.RedditDb;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.DomainIndexingState;
-import nu.marginalia.model.idx.WordFlags;
+import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.util.ProcessingIterator;
 import org.apache.commons.lang3.StringUtils;

@@ -30,16 +29,13 @@ public class RedditSideloader implements SideloadSource {
     private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class);

     private final List<Path> dbFiles;
-    private final AnchorTagsSourceFactory anchorTagsSourceFactory;
     private final AnchorTextKeywords anchorTextKeywords;
     private final SideloaderProcessing sideloaderProcessing;

     public RedditSideloader(List<Path> listToDbFiles,
-                            AnchorTagsSourceFactory anchorTagsSourceFactory,
                             AnchorTextKeywords anchorTextKeywords,
                             SideloaderProcessing sideloaderProcessing) {
         this.dbFiles = listToDbFiles;
-        this.anchorTagsSourceFactory = anchorTagsSourceFactory;
         this.anchorTextKeywords = anchorTextKeywords;
         this.sideloaderProcessing = sideloaderProcessing;
     }
@@ -116,14 +112,25 @@ public class RedditSideloader implements SideloadSource {
                 .ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC)
                 .getYear();

-        String fullHtml = "<!DOCTYPE html>\n<html>\n<head>\n <title>" + title + "</title>\n <script src=\"https://www.example.com/dummy.js\" type=\"text/javascript\"></script>\n</head>\n<body>\n <h1>" + title + "</h1>\n <article>\n <p>" + body + "</p>\n </article>\n</body>\n</html>\n";
+        String fullHtml = """
+                <!DOCTYPE html>
+                <html>
+                <head>
+                  <title>%s</title>
+                  <script src="https://www.example.com/dummy.js" type="text/javascript"></script>
+                </head>
+                <body>
+                  <h1>%s</h1>
+                  <h2>reddit r/%s %s</h2>
+                  <article>
+                    <p>%s</p>
+                  </article>
+                </body>
+                </html>
+                """.formatted(title, title, subreddit, subreddit, body);

         List<String> extraKeywords = new ArrayList<>();

-        extraKeywords.add("reddit");
-        extraKeywords.add(subreddit);
-        extraKeywords.add("r/" + subreddit);
-
         if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) {
             extraKeywords.add(author);
         }
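The replacement above leans on Java text blocks together with String.formatted, which fills the %s placeholders left to right. A minimal, self-contained sketch of the same pattern, using made-up values rather than the sideloader's actual fields:

    // Illustrative only: demonstrates the text-block + formatted() pattern used above.
    public class TextBlockDemo {
        public static void main(String[] args) {
            String title = "Example post";
            String subreddit = "java";
            String body = "Hello world";

            // %s placeholders are substituted in argument order by formatted()
            String html = """
                    <h1>%s</h1>
                    <h2>reddit r/%s %s</h2>
                    <p>%s</p>
                    """.formatted(title, subreddit, subreddit, body);

            System.out.println(html);
        }
    }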
@@ -147,12 +154,18 @@ public class RedditSideloader implements SideloadSource {


             if (doc.isProcessedFully()) {
-                for (var keyword : extraKeywords) {
-                    doc.words.addMeta(keyword, WordFlags.Subjects.asBit());
+                // Insert topology information
+                if (doc.details != null) {
+                    doc.details.metadata.withSizeAndTopology(50_000_000, score);
                 }

-                // Insert topology information
-                doc.details.metadata.withSizeAndTopology(50_000_000, score);
+                if (doc.words != null) {
+                    doc.words.addAllSyntheticTerms(List.of("generator:forum",
+                            HtmlFeature.COOKIES.getKeyword(),
+                            HtmlFeature.JS.getKeyword(),
+                            HtmlFeature.TRACKING_ADTECH.getKeyword()
+                    ));
+                }
             }


@@ -24,10 +24,7 @@ import org.apache.commons.lang3.StringUtils;
 import org.jsoup.Jsoup;

 import java.nio.file.Path;
-import java.util.Arrays;
-import java.util.EnumSet;
-import java.util.Iterator;
-import java.util.List;
+import java.util.*;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.TimeUnit;

@@ -36,6 +33,8 @@ public class StackexchangeSideloader implements SideloadSource {
     private final DocumentKeywordExtractor keywordExtractor;
     private final String domainName;

+    private final EnumSet<HtmlFeature> applyFeatures = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
+
     private final Path dbFile;

     public StackexchangeSideloader(Path pathToDbFile,
@@ -133,12 +132,17 @@ public class StackexchangeSideloader implements SideloadSource {

         ret.url = url;
         ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url);
-        ret.words.addAllSyntheticTerms(List.of(
-                "site:" + domainName,
-                "site:" + url.domain.topDomain,
-                url.domain.topDomain,
-                domainName
-        ));
+
+        List<String> syntheticTerms = new ArrayList<>(
+                List.of("site:" + domainName,
+                        "site:" + url.domain.topDomain,
+                        url.domain.topDomain,
+                        domainName)
+        );
+        for (HtmlFeature feature : applyFeatures) {
+            syntheticTerms.add(feature.getKeyword());
+        }
+        ret.words.addAllSyntheticTerms(syntheticTerms);

         if (!post.tags().isBlank()) {
             List<String> subjects = Arrays.asList(post.tags().split(","));
@@ -152,7 +156,7 @@ public class StackexchangeSideloader implements SideloadSource {
                 PubDate.toYearByte(ret.details.pubYear),
                 (int) -ret.details.quality,
                 EnumSet.of(DocumentFlags.GeneratorDocs));
-        ret.details.features = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
+        ret.details.features = applyFeatures;

         ret.details.metadata.withSizeAndTopology(10000, 0);

@@ -12,6 +12,7 @@ import nu.marginalia.model.processed.SlopDocumentRecord;
 import nu.marginalia.model.processed.SlopDomainLinkRecord;
 import nu.marginalia.model.processed.SlopDomainRecord;
 import nu.marginalia.sequence.VarintCodedSequence;
+import org.apache.commons.io.FileUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -32,20 +33,26 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
     private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class);

     public ConverterBatchWriter(Path basePath, int batchNumber) throws IOException {
-        if (!Files.exists(ProcessedDataFileNames.domainFileName(basePath))) {
-            Files.createDirectory(ProcessedDataFileNames.domainFileName(basePath));
-        }
-        domainWriter = new SlopDomainRecord.Writer(ProcessedDataFileNames.domainFileName(basePath), batchNumber);
+        Path domainPath = initSlopDir(ProcessedDataFileNames.domainFileName(basePath));
+        Path linksPath = initSlopDir(ProcessedDataFileNames.domainLinkFileName(basePath));
+        Path docsPath = initSlopDir(ProcessedDataFileNames.documentFileName(basePath));

-        if (!Files.exists(ProcessedDataFileNames.domainLinkFileName(basePath))) {
-            Files.createDirectory(ProcessedDataFileNames.domainLinkFileName(basePath));
-        }
-        domainLinkWriter = new SlopDomainLinkRecord.Writer(ProcessedDataFileNames.domainLinkFileName(basePath), batchNumber);
+        domainWriter = new SlopDomainRecord.Writer(domainPath, batchNumber);
+        domainLinkWriter = new SlopDomainLinkRecord.Writer(linksPath, batchNumber);
+        documentWriter = new SlopDocumentRecord.Writer(docsPath, batchNumber);
+    }

-        if (!Files.exists(ProcessedDataFileNames.documentFileName(basePath))) {
-            Files.createDirectory(ProcessedDataFileNames.documentFileName(basePath));
+    private Path initSlopDir(Path p) throws IOException {
+        if (Files.isDirectory(p)) {
+            FileUtils.deleteDirectory(p.toFile());
         }
-        documentWriter = new SlopDocumentRecord.Writer(ProcessedDataFileNames.documentFileName(basePath), batchNumber);
+        else if (Files.exists(p)) {
+            Files.delete(p);
+        }
+
+        Files.createDirectories(p);
+
+        return p;
     }

     /** Sets the lowest ordinal value for the documents in this batch */
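The new constructor delegates directory setup to initSlopDir, which makes batch output idempotent: whatever already exists at the target path, a stale directory from an earlier run or a plain file, is removed before an empty directory is recreated. A minimal standalone sketch of the same idiom, with a made-up path and assuming commons-io on the classpath:

    import org.apache.commons.io.FileUtils;
    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;

    class ResetDirDemo {
        // Remove anything at 'p' and recreate it as an empty directory
        static Path resetDir(Path p) throws IOException {
            if (Files.isDirectory(p)) {
                FileUtils.deleteDirectory(p.toFile());   // recursive delete
            }
            else if (Files.exists(p)) {
                Files.delete(p);                         // stray regular file
            }
            Files.createDirectories(p);
            return p;
        }

        public static void main(String[] args) throws IOException {
            Path out = resetDir(Path.of("/tmp/slop-demo"));  // hypothetical path
            System.out.println("clean output dir: " + out);
        }
    }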
@@ -114,7 +121,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
             documentWriter.write(new SlopDocumentRecord(
                     domainName,
                     document.url.toString(),
-                    ordinal,
+                    ordinal++,
                     document.state.toString(),
                     document.stateReason,
                     document.details.title,
@@ -132,17 +139,15 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
                     spanCodes,
                     spanSequences
             ));
-
-            ordinal++;
         }

     }

-    private Object writeLinkData(ProcessedDomain domain) throws IOException {
+    private void writeLinkData(ProcessedDomain domain) throws IOException {
         String from = domain.domain.toString();

         if (domain.documents == null)
-            return this;
+            return;

         Set<EdgeDomain> seen = new HashSet<>();

@@ -171,10 +176,9 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
             ));
         }

-        return this;
     }

-    public Object writeDomainData(ProcessedDomain domain) throws IOException {
+    public void writeDomainData(ProcessedDomain domain) throws IOException {
         DomainMetadata metadata = DomainMetadata.from(domain);

         List<String> feeds = getFeedUrls(domain);
@@ -191,8 +195,6 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
                     feeds
             )
         );
-
-        return this;
     }

     private List<String> getFeedUrls(ProcessedDomain domain) {
@@ -3,6 +3,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
 import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
 import nu.marginalia.converting.processor.summary.SummaryExtractor;
+import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.test.CommonTestData;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.BeforeAll;
@@ -34,8 +35,8 @@ class JavadocSpecializationTest {
     }

     @Test
-    void generatorExtraction() {
-        var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders(""));
+    void generatorExtraction() throws Exception {
+        var gen = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(thread), new DocumentHeaders(""));

         System.out.println(gen);
     }
@@ -3,11 +3,13 @@ package nu.marginalia.converting.processor.plugin.specialization;
 import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
 import nu.marginalia.converting.processor.summary.SummaryExtractor;
+import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.test.CommonTestData;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;

+import java.net.URISyntaxException;
 import java.util.Set;

 class LemmySpecializationTest {
@@ -37,9 +39,9 @@ class LemmySpecializationTest {
     }

     @Test
-    void generatorExtraction() {
-        var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), new DocumentHeaders(""));
-        var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), new DocumentHeaders(""));
+    void generatorExtraction() throws URISyntaxException {
+        var generatorIndex = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(lemmyIndexHtml), new DocumentHeaders(""));
+        var generatorPost = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(lemmyPost), new DocumentHeaders(""));

         System.out.println(generatorIndex);
         System.out.println(generatorPost);
@@ -3,11 +3,13 @@ package nu.marginalia.converting.processor.plugin.specialization;
 import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
 import nu.marginalia.converting.processor.summary.SummaryExtractor;
+import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.test.CommonTestData;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;

+import java.net.URISyntaxException;
 import java.util.Set;

 class XenForoSpecializationTest {
@@ -34,8 +36,8 @@ class XenForoSpecializationTest {
     }

     @Test
-    void generatorExtraction() {
-        var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders(""));
+    void generatorExtraction() throws URISyntaxException {
+        var gen = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(thread), new DocumentHeaders(""));

         System.out.println(gen);
     }
@@ -42,7 +42,8 @@ public class LinkParser {
                 .flatMap(this::createURI)
                 .map(URI::normalize)
                 .map(this::renormalize)
-                .flatMap(this::createEdgeUrl);
+                .flatMap(this::createEdgeUrl)
+                .filter(url -> !hasBinarySuffix(url.path));
     }

     @Contract(pure=true)
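The added .filter step drops links whose path ends in a binary file suffix before they become crawl candidates. The hasBinarySuffix helper already exists in LinkParser and its body is not part of this hunk; a rough, purely illustrative sketch of what such a suffix check typically looks like, with a hypothetical suffix list:

    import java.util.Set;

    class BinarySuffixCheckDemo {
        // Hypothetical suffix list; the real LinkParser may use a different set.
        private static final Set<String> BINARY_SUFFIXES =
                Set.of(".png", ".jpg", ".gif", ".zip", ".pdf", ".exe");

        static boolean hasBinarySuffix(String path) {
            String lc = path.toLowerCase();
            for (String suffix : BINARY_SUFFIXES) {
                if (lc.endsWith(suffix)) {
                    return true;
                }
            }
            return false;
        }

        public static void main(String[] args) {
            System.out.println(hasBinarySuffix("/images/logo.png"));   // true
            System.out.println(hasBinarySuffix("/about/index.html"));  // false
        }
    }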
@@ -1,5 +1,6 @@
 package nu.marginalia.crawl.retreival;

+import nu.marginalia.ContentTypes;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.lsh.EasyLSH;
 import nu.marginalia.model.crawldata.CrawledDocument;
@@ -43,6 +44,9 @@ public class CrawlDataReference implements AutoCloseable {
         try {
             while (data.hasNext()) {
                 if (data.next() instanceof CrawledDocument doc) {
+                    if (!ContentTypes.isAccepted(doc.contentType))
+                        continue;
+
                     return doc;
                 }
             }
@@ -317,26 +317,24 @@ public class CrawlerRetreiver implements AutoCloseable {

         long probeStart = System.currentTimeMillis();

-        /*
-        probing is on probation for now while we evaluate how much the added delays slows down the crawler
-
         if (probeType == HttpFetcher.ProbeType.FULL) {
+            retryLoop:
             for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
                 try {
                     var probeResult = fetcher.probeContentType(url, warcRecorder, contentTags);

-                    if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.Ok ok) {
-                        url = ok.resolvedUrl(); // If we were redirected while probing, use the final URL for fetching
-                        break;
-                    } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.BadContentType badContentType) {
-                        return new HttpFetchResult.ResultNone();
-                    } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout) {
-                        return new HttpFetchResult.ResultException(timeout.ex());
-                    } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.Exception exception) {
-                        return new HttpFetchResult.ResultException(exception.ex());
-                    }
-                    else { // should be unreachable
+                    switch (probeResult) {
+                        case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
+                            url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
+                            break retryLoop;
+                        case HttpFetcher.ContentTypeProbeResult.BadContentType badContentType:
+                            return new HttpFetchResult.ResultNone();
+                        case HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout:
+                            return new HttpFetchResult.ResultException(timeout.ex());
+                        case HttpFetcher.ContentTypeProbeResult.Exception exception:
+                            return new HttpFetchResult.ResultException(exception.ex());
+                        default: // should be unreachable
                             throw new IllegalStateException("Unknown probe result");
                     }
                 }
                 catch (HttpFetcherImpl.RateLimitException ex) {
@@ -348,8 +346,8 @@ public class CrawlerRetreiver implements AutoCloseable {
                     }
                 }

                 timer.waitFetchDelay(System.currentTimeMillis() - probeStart);
-        }*/
+        }


         for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
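The rewritten probe handling relies on two newer Java features: pattern matching for switch over the probe result (with a record deconstruction pattern for the Ok case) and a labeled break that exits the enclosing retry loop from inside the switch. A self-contained sketch of the same shape, using made-up types rather than Marginalia's HttpFetcher result hierarchy:

    public class ProbeSwitchDemo {
        sealed interface Result permits Ok, Timeout, Error {}
        record Ok(String resolvedUrl) implements Result {}
        record Timeout() implements Result {}
        record Error(String message) implements Result {}

        static String fetch(Result[] attempts) {
            String url = "https://www.example.com/";

            retryLoop:
            for (Result probeResult : attempts) {
                switch (probeResult) {
                    case Ok(String resolvedUrl):
                        url = resolvedUrl;   // record pattern binds the component directly
                        break retryLoop;     // labeled break leaves the for loop, not just the switch
                    case Timeout ignored:
                        continue;            // try the next attempt
                    case Error e:
                        return "failed: " + e.message();
                    default:
                        throw new IllegalStateException("Unknown probe result");
                }
            }
            return url;
        }

        public static void main(String[] args) {
            System.out.println(fetch(new Result[] { new Timeout(), new Ok("https://www.example.com/final") }));
        }
    }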
@@ -0,0 +1,22 @@
+package nu.marginalia;
+
+import java.util.Set;
+
+public class ContentTypes {
+    public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
+            "application/xhtml",
+            "text/html",
+            "image/x-icon",
+            "text/plain");
+
+    public static boolean isAccepted(String contentTypeHeader) {
+        String lcHeader = contentTypeHeader.toLowerCase();
+        for (var type : acceptedContentTypes) {
+            if (lcHeader.startsWith(type)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+}
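The check is a prefix match on the lowercased header value, so Content-Type parameters such as a charset suffix do not defeat it. A quick usage sketch, assuming the class above is on the classpath:

    import nu.marginalia.ContentTypes;

    public class ContentTypesDemo {
        public static void main(String[] args) {
            // Prefix matching means a charset parameter is tolerated
            System.out.println(ContentTypes.isAccepted("text/html; charset=UTF-8")); // true
            System.out.println(ContentTypes.isAccepted("application/pdf"));          // false
        }
    }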
@@ -1,6 +1,7 @@
 package nu.marginalia.parquet.crawldata;

 import blue.strategic.parquet.ParquetWriter;
+import nu.marginalia.ContentTypes;
 import nu.marginalia.UserAgent;
 import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.DocumentBodyResult;
@@ -62,6 +63,8 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
         }
     }

+
+
     /** Return true if the WarcResponse should be excluded from conversion */
     private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {

@@ -74,14 +77,25 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
             return false;
         }

-        var robotsTags = response.http().headers().all("X-Robots-Tag");
+        var headers = response.http().headers();
+        var robotsTags = headers.all("X-Robots-Tag");
+
         if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
             return false;
         }

+        // Strip out responses with content types we aren't interested in
+        // (though ideally we wouldn't download these at all)
+        String contentType = headers.first("Content-Type").orElse("text/plain").toLowerCase();
+
+        if (!ContentTypes.isAccepted(contentType)) {
+            return false;
+        }
+
         return true;
     }

+
     private void write(String domain, WarcXEntityRefused refused) throws IOException {
         URI profile = refused.profile();

@@ -157,10 +157,10 @@ class WarcRecorderTest {
                 fileNameParquet);

         var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
-        assertEquals(3, urls.size());
+        assertEquals(2, urls.size());
         assertEquals("https://www.marginalia.nu/", urls.get(0));
         assertEquals("https://www.marginalia.nu/log/", urls.get(1));
-        assertEquals("https://www.marginalia.nu/sanic.png", urls.get(2));
+        // sanic.jpg gets filtered out for its bad mime type

     }

@@ -41,6 +41,7 @@ import java.time.temporal.ChronoUnit;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;

 import static nu.marginalia.mqapi.ProcessInboxNames.LIVE_CRAWLER_INBOX;

@@ -196,7 +197,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
             writer.setOrdinalOffset(67_000_000);

             for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) {
-                writer.write(domainProcessor.sideloadProcessing(stream, 0));
+                writer.write(domainProcessor.sideloadProcessing(stream, 0, Set.of("special:live")));
             }
         }

@@ -75,7 +75,6 @@ public class DocumentLoaderService {

         public void accept(SlopDocumentRecord.MetadataProjection projection)
         {
-
             long urlId = UrlIdCodec.encodeId(
                     domainIdRegistry.getDomainId(projection.domain()),
                     projection.ordinal()
@@ -88,7 +87,7 @@ public class DocumentLoaderService {
         }

         try {
-            documentDbWriter.add(new DocdbUrlDetail(
+            details.add(new DocdbUrlDetail(
                     urlId,
                     parsedUrl.get(),
                     projection.title(),