diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java
index f5effebd..f9ae890c 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java
@@ -22,6 +22,7 @@ import org.slf4j.LoggerFactory;
 
 import java.sql.SQLException;
 import java.util.*;
+import java.util.regex.Pattern;
 
 public class DomainProcessor {
     private final DocumentProcessor documentProcessor;
@@ -94,6 +95,10 @@ public class DomainProcessor {
             terms.add(HtmlFeature.COOKIES.getKeyword());
         }
 
+        if (isAcademicDomain(ret.domain)) {
+            terms.add("special:academia");
+        }
+
         for (var document : ret.documents) {
             if (document.details == null)
                 continue;
@@ -114,6 +119,19 @@ public class DomainProcessor {
         return ret;
     }
 
+
+    private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
+    private boolean isAcademicDomain(EdgeDomain domain) {
+
+        if (domain.domain.endsWith(".edu"))
+            return true;
+
+        if (academicPattern.matcher(domain.domain).matches())
+            return true;
+
+        return false;
+    }
+
     private void fixBadCanonicalTag(CrawledDocument doc) {
         // Some sites have a canonical tag that points to a different domain,
         // but our loader can not support this, so we point these back to the