From fc30da0d480fd690e2eaf4ec1fd741b294fd9f12 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 8 Dec 2023 20:31:34 +0100 Subject: [PATCH] (converter) Add academia recognition to DomainProcessor The code now includes an additional function in the DomainProcessor class that checks if a domain is associated with academia. A domain is considered academic if it has the ".edu" TLD, or if it matches a regex pattern for domains like *.ac.ccTld or *.edu.ccTld. If either condition is met, the search term "special:academia" is added to the domain. The existing academia search filter uses personalized pagerank to select academia-adjacent domains, but it isn't working very well. The hope is that filtering on domain names will be more effective, and that it can supplant the ranking-based approach. --- .../converting/processor/DomainProcessor.java | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index f5effebd..f9ae890c 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -22,6 +22,7 @@ import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.*; +import java.util.regex.Pattern; public class DomainProcessor { private final DocumentProcessor documentProcessor; @@ -94,6 +95,10 @@ public class DomainProcessor { terms.add(HtmlFeature.COOKIES.getKeyword()); } + if (isAcademicDomain(ret.domain)) { + terms.add("special:academia"); + } + for (var document : ret.documents) { if (document.details == null) continue; @@ -114,6 +119,19 @@ public class DomainProcessor { return ret; } + + private static final Pattern academicPattern = 
Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$"); + private boolean isAcademicDomain(EdgeDomain domain) { + + if (domain.domain.endsWith(".edu")) + return true; + + if (academicPattern.matcher(domain.domain).matches()) + return true; + + return false; + } + private void fixBadCanonicalTag(CrawledDocument doc) { // Some sites have a canonical tag that points to a different domain, // but our loader can not support this, so we point these back to the