mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(converter) Add academia recognition to DomainProcessor
The DomainProcessor class now includes an additional function that checks whether a domain is associated with academia. A domain is considered academic if it uses the ".edu" TLD, or if it matches a regex pattern for domains like *.ac.ccTld or *.edu.ccTld. If either condition is met, the search term "special:academia" is added to the domain. The existing academia search filter uses personalized pagerank to select academia-adjacent domains, but it isn't working very well. The hope is that filtering on domain names will be more effective, and that it can supplant the ranking-based approach.
This commit is contained in:
parent
156c067f79
commit
fc30da0d48
@ -22,6 +22,7 @@ import org.slf4j.LoggerFactory;
|
|||||||
|
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
public class DomainProcessor {
|
public class DomainProcessor {
|
||||||
private final DocumentProcessor documentProcessor;
|
private final DocumentProcessor documentProcessor;
|
||||||
@ -94,6 +95,10 @@ public class DomainProcessor {
|
|||||||
terms.add(HtmlFeature.COOKIES.getKeyword());
|
terms.add(HtmlFeature.COOKIES.getKeyword());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (isAcademicDomain(ret.domain)) {
|
||||||
|
terms.add("special:academia");
|
||||||
|
}
|
||||||
|
|
||||||
for (var document : ret.documents) {
|
for (var document : ret.documents) {
|
||||||
if (document.details == null)
|
if (document.details == null)
|
||||||
continue;
|
continue;
|
||||||
@ -114,6 +119,19 @@ public class DomainProcessor {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
|
||||||
|
private boolean isAcademicDomain(EdgeDomain domain) {
|
||||||
|
|
||||||
|
if (domain.domain.endsWith(".edu"))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (academicPattern.matcher(domain.domain).matches())
|
||||||
|
return true;
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
private void fixBadCanonicalTag(CrawledDocument doc) {
|
private void fixBadCanonicalTag(CrawledDocument doc) {
|
||||||
// Some sites have a canonical tag that points to a different domain,
|
// Some sites have a canonical tag that points to a different domain,
|
||||||
// but our loader can not support this, so we point these back to the
|
// but our loader can not support this, so we point these back to the
|
||||||
|
Loading…
Reference in New Issue
Block a user