mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Processor fixes: Excluding phpinfo()-pages, mastodon feeds.
This commit is contained in:
parent
13c8305dc2
commit
179d54d50a
@ -167,6 +167,7 @@ public class DocumentKeywordExtractor {
|
||||
if (lc.length() > 6
|
||||
&& lc.indexOf('@') > 0
|
||||
&& mailLikePattern.matcher(lc).matches()) {
|
||||
|
||||
reps.add(lc);
|
||||
|
||||
String domain = lc.substring(lc.indexOf('@'));
|
||||
|
@ -176,6 +176,8 @@ public class DocumentProcessor {
|
||||
throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
|
||||
}
|
||||
|
||||
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
|
||||
|
||||
Document prunedDoc = doc.clone();
|
||||
|
||||
prunedDoc.getElementsByTag("svg").remove();
|
||||
@ -194,10 +196,12 @@ public class DocumentProcessor {
|
||||
ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
|
||||
ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();
|
||||
|
||||
final boolean doSimpleProcessing = ret.quality < minDocumentQuality;
|
||||
|
||||
EdgePageWordSet words;
|
||||
if (doSimpleProcessing) {
|
||||
if (shouldDoSimpleProcessing(url, ret)) {
|
||||
/* Some documents we'll index, but only superficially. This is a compromise
|
||||
to allow them to be discoverable, without having them show up without specific
|
||||
queries. This also saves a lot of processing power.
|
||||
*/
|
||||
ret.features = Set.of(HtmlFeature.UNKNOWN);
|
||||
words = keywordExtractor.extractKeywordsMinimal(dld);
|
||||
ret.description = "";
|
||||
@ -208,7 +212,6 @@ public class DocumentProcessor {
|
||||
ret.description = getDescription(doc);
|
||||
}
|
||||
|
||||
var url = new EdgeUrl(crawledDocument.url);
|
||||
addMetaWords(ret, url, crawledDomain, words);
|
||||
|
||||
getLinks(url, ret, doc, words);
|
||||
@ -216,6 +219,29 @@ public class DocumentProcessor {
|
||||
return new DetailsWithWords(ret, words);
|
||||
}
|
||||
|
||||
private boolean shouldDoSimpleProcessing(EdgeUrl url, ProcessedDocumentDetails ret) {
|
||||
if (ret.quality < minDocumentQuality) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// These pages shouldn't be publicly accessible
|
||||
if ("phpinfo()".equals(ret.title)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Urls that look like /@foo are typically Mastodon or other twitter-like feeds,
|
||||
// we don't want to index them because they change so rapidly; subdirectories are
|
||||
// fine though
|
||||
//
|
||||
// The first startsWith criteria is a performance optimization, even with a compiled
|
||||
// pattern it is something like 50x faster
|
||||
if (url.path.startsWith("/@") && url.path.matches("^/@[^/]+/?$")) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, CrawledDomain domain, EdgePageWordSet words) {
|
||||
List<String> tagWords = new ArrayList<>();
|
||||
|
||||
|
@ -4,7 +4,8 @@ import nu.marginalia.util.language.LanguageFilter;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
class LanguageFilterTest {
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user