mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(converter) Add academia recognition to DomainProcessor
The code now includes an additional function in the DomainProcessor class that checks if a domain is associated with academia. A domain is considered academic if it ends with the ".edu" TLD, or if it matches a regex pattern for domains like *.ac.ccTld or *.edu.ccTld. If either condition is met, the search term "special:academia" is added to the domain. The existing academia search filter uses personalized pagerank to select academia-adjacent domains, but it isn't working very well. The hope is that filtering on domain names will be more effective, and that it can supplant the ranking-based approach.
This commit is contained in:
parent
156c067f79
commit
fc30da0d48
@ -22,6 +22,7 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class DomainProcessor {
|
||||
private final DocumentProcessor documentProcessor;
|
||||
@ -94,6 +95,10 @@ public class DomainProcessor {
|
||||
terms.add(HtmlFeature.COOKIES.getKeyword());
|
||||
}
|
||||
|
||||
if (isAcademicDomain(ret.domain)) {
|
||||
terms.add("special:academia");
|
||||
}
|
||||
|
||||
for (var document : ret.documents) {
|
||||
if (document.details == null)
|
||||
continue;
|
||||
@ -114,6 +119,19 @@ public class DomainProcessor {
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
// Accepts names that end in ".ac." or ".edu." followed by exactly two
// lowercase letters (e.g. "*.ac.uk", "*.edu.au"). Compiled once and reused.
private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
|
||||
private boolean isAcademicDomain(EdgeDomain domain) {
|
||||
|
||||
if (domain.domain.endsWith(".edu"))
|
||||
return true;
|
||||
|
||||
if (academicPattern.matcher(domain.domain).matches())
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private void fixBadCanonicalTag(CrawledDocument doc) {
|
||||
// Some sites have a canonical tag that points to a different domain,
|
||||
// but our loader can not support this, so we point these back to the
|
||||
|
Loading…
Reference in New Issue
Block a user