mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Tidying up the HTML plugin.
This commit is contained in:
parent
384de2e54b
commit
be040419f3
@ -20,6 +20,9 @@ public enum EdgePageWordFlags {
|
||||
* @see NameCounter */
|
||||
NamesWords,
|
||||
|
||||
/** The word isn't actually a word on the page, but a fake keyword from the code
|
||||
* to aid discovery
|
||||
*/
|
||||
Synthetic,
|
||||
|
||||
/** Word is important to site
|
||||
|
@ -33,6 +33,8 @@ import java.util.EnumSet;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import static nu.marginalia.converting.model.DisqualifiedException.*;
|
||||
|
||||
|
||||
public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
|
||||
|
||||
@ -83,13 +85,13 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
String documentBody = crawledDocument.documentBody.decode();
|
||||
|
||||
if (languageFilter.isBlockedUnicodeRange(documentBody)) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
|
||||
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
|
||||
}
|
||||
|
||||
Document doc = Jsoup.parse(documentBody);
|
||||
|
||||
if (doc.select("meta[name=robots]").attr("content").contains("noindex")) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.FORBIDDEN);
|
||||
throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
|
||||
}
|
||||
|
||||
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
|
||||
@ -113,7 +115,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
// don't move this up! it uses title and quality
|
||||
// and is run before the heavy computations below
|
||||
if (isDisqualified(url, dld, ret)) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.QUALITY);
|
||||
throw new DisqualifiedException(DisqualificationReason.QUALITY);
|
||||
}
|
||||
|
||||
KeywordMetadata keywordMetadata = new KeywordMetadata();
|
||||
@ -206,10 +208,13 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
.ifPresent(lp::acceptFeed);
|
||||
}
|
||||
|
||||
createLinkKeywords(words, lp);
|
||||
createFileLinkKeywords(words, lp, domain);
|
||||
createLinkKeywords(words, lp);
|
||||
}
|
||||
|
||||
// If a document links to a file on the same server, and that file has
|
||||
// a salient file ending, then add the filename as a keyword so that it can
|
||||
// be found
|
||||
private void createFileLinkKeywords(DocumentKeywordsBuilder words, LinkProcessor lp, EdgeDomain domain) {
|
||||
Set<String> fileKeywords = new HashSet<>(100);
|
||||
for (var link : lp.getNonIndexableUrls()) {
|
||||
|
Loading…
Reference in New Issue
Block a user