mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Processor fixes: Excluding phpinfo()-pages, mastodon feeds.
This commit is contained in:
parent
13c8305dc2
commit
179d54d50a
@ -167,6 +167,7 @@ public class DocumentKeywordExtractor {
|
|||||||
if (lc.length() > 6
|
if (lc.length() > 6
|
||||||
&& lc.indexOf('@') > 0
|
&& lc.indexOf('@') > 0
|
||||||
&& mailLikePattern.matcher(lc).matches()) {
|
&& mailLikePattern.matcher(lc).matches()) {
|
||||||
|
|
||||||
reps.add(lc);
|
reps.add(lc);
|
||||||
|
|
||||||
String domain = lc.substring(lc.indexOf('@'));
|
String domain = lc.substring(lc.indexOf('@'));
|
||||||
|
@ -176,6 +176,8 @@ public class DocumentProcessor {
|
|||||||
throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
|
throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
|
||||||
|
|
||||||
Document prunedDoc = doc.clone();
|
Document prunedDoc = doc.clone();
|
||||||
|
|
||||||
prunedDoc.getElementsByTag("svg").remove();
|
prunedDoc.getElementsByTag("svg").remove();
|
||||||
@ -194,10 +196,12 @@ public class DocumentProcessor {
|
|||||||
ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
|
ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
|
||||||
ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();
|
ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();
|
||||||
|
|
||||||
final boolean doSimpleProcessing = ret.quality < minDocumentQuality;
|
|
||||||
|
|
||||||
EdgePageWordSet words;
|
EdgePageWordSet words;
|
||||||
if (doSimpleProcessing) {
|
if (shouldDoSimpleProcessing(url, ret)) {
|
||||||
|
/* Some documents we'll index, but only superficially. This is a compromise
|
||||||
|
to allow them to be discoverable, without having them show up without specific
|
||||||
|
queries. This also saves a lot of processing power.
|
||||||
|
*/
|
||||||
ret.features = Set.of(HtmlFeature.UNKNOWN);
|
ret.features = Set.of(HtmlFeature.UNKNOWN);
|
||||||
words = keywordExtractor.extractKeywordsMinimal(dld);
|
words = keywordExtractor.extractKeywordsMinimal(dld);
|
||||||
ret.description = "";
|
ret.description = "";
|
||||||
@ -208,7 +212,6 @@ public class DocumentProcessor {
|
|||||||
ret.description = getDescription(doc);
|
ret.description = getDescription(doc);
|
||||||
}
|
}
|
||||||
|
|
||||||
var url = new EdgeUrl(crawledDocument.url);
|
|
||||||
addMetaWords(ret, url, crawledDomain, words);
|
addMetaWords(ret, url, crawledDomain, words);
|
||||||
|
|
||||||
getLinks(url, ret, doc, words);
|
getLinks(url, ret, doc, words);
|
||||||
@ -216,6 +219,29 @@ public class DocumentProcessor {
|
|||||||
return new DetailsWithWords(ret, words);
|
return new DetailsWithWords(ret, words);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean shouldDoSimpleProcessing(EdgeUrl url, ProcessedDocumentDetails ret) {
|
||||||
|
if (ret.quality < minDocumentQuality) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// These pages shouldn't be publicly accessible
|
||||||
|
if ("phpinfo()".equals(ret.title)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Urls that look like /@foo are typically Mastodon or other twitter-like feeds,
|
||||||
|
// we don't want to index them because they change so rapidly; subdirectories are
|
||||||
|
// fine though
|
||||||
|
//
|
||||||
|
// The first startsWith criteria is a performance optimization, even with a compiled
|
||||||
|
// pattern it is something like 50x faster
|
||||||
|
if (url.path.startsWith("/@") && url.path.matches("^/@[^/]+/?$")) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, CrawledDomain domain, EdgePageWordSet words) {
|
private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, CrawledDomain domain, EdgePageWordSet words) {
|
||||||
List<String> tagWords = new ArrayList<>();
|
List<String> tagWords = new ArrayList<>();
|
||||||
|
|
||||||
|
@ -4,7 +4,8 @@ import nu.marginalia.util.language.LanguageFilter;
|
|||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
class LanguageFilterTest {
|
class LanguageFilterTest {
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user