From 0f9b90eb1c7816942614ac6ff7f782512e609934 Mon Sep 17 00:00:00 2001 From: Viktor Date: Mon, 10 Jul 2023 17:36:12 +0200 Subject: [PATCH] Better fingerprinting (#35) * Better fingerprinting for server tech * Many more features in FeatureExtractor * Blog specialization * SiteType table --- code/common/db/build.gradle | 8 +- .../java/nu/marginalia/db/DomainTypes.java | 179 ++++++++++++++ .../resources/sql/current/10-domain-type.sql | 19 ++ .../nu/marginalia/db/DomainTypesTest.java | 63 +++++ .../marginalia/model/crawl/HtmlFeature.java | 44 +++- .../language/model/DocumentLanguageData.java | 9 + .../processes/converting-process/build.gradle | 1 + .../marginalia/converting/ConverterMain.java | 4 +- .../processor/ConverterDomainTypes.java | 53 ++++ .../logic/DocumentGeneratorExtractor.java | 123 ++++++++-- .../processor/logic/DocumentValuator.java | 5 +- .../processor/logic/FeatureExtractor.java | 227 +++++++++++++++--- .../plugin/HtmlDocumentProcessorPlugin.java | 25 +- .../specialization/BlogSpecialization.java | 210 ++++++++++++++++ .../HtmlProcessorSpecializations.java | 23 +- .../ConvertingIntegrationTestModule.java | 3 + .../BlogSpecializationTest.java | 17 ++ .../JavadocSpecializationTest.java | 2 +- .../LemmySpecializationTest.java | 4 +- .../XenForoSpecializationTest.java | 2 +- .../crawl/retreival/LinkFilterSelector.java | 3 +- .../index/svc/IndexQueryService.java | 2 +- .../marginalia/search/model/UrlDetails.java | 4 +- .../experiments/DebugConverterExperiment.java | 49 +--- 24 files changed, 963 insertions(+), 116 deletions(-) create mode 100644 code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java create mode 100644 code/common/db/src/main/resources/sql/current/10-domain-type.sql create mode 100644 code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java create mode 100644 
code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java create mode 100644 code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecializationTest.java diff --git a/code/common/db/build.gradle b/code/common/db/build.gradle index a06d8c3e..b7e3f0ef 100644 --- a/code/common/db/build.gradle +++ b/code/common/db/build.gradle @@ -2,6 +2,7 @@ plugins { id 'java' id "io.freefair.lombok" version "5.3.3.3" id 'jvm-test-suite' + } java { @@ -32,8 +33,14 @@ dependencies { testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito + + + testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation 'org.testcontainers:mariadb:1.17.4' + testImplementation 'org.testcontainers:junit-jupiter:1.17.4' } + test { maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 maxHeapSize = "8G" @@ -47,4 +54,3 @@ task fastTests(type: Test) { excludeTags "slow" } } - diff --git a/code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java b/code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java new file mode 100644 index 00000000..60b42030 --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java @@ -0,0 +1,179 @@ +package nu.marginalia.db; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.id.EdgeIdList; +import org.slf4j.LoggerFactory; +import org.slf4j.Logger; + +import javax.inject.Inject; +import javax.inject.Singleton; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URL; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +/** A list of domains that are known to be of a certain type */ +@Singleton +public class DomainTypes { + + public enum Type { + BLOG, + TEST + 
}; + + private final Logger logger = LoggerFactory.getLogger(DomainTypes.class); + + private final HikariDataSource dataSource; + + @Inject + public DomainTypes(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + /** Get all domains of a certain type, including domains that are not in the EC_DOMAIN table */ + public List getAllDomainsByType(Type type) { + List ret = new ArrayList<>(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT DOMAIN_NAME + FROM DOMAIN_SELECTION INNER JOIN DOMAIN_SELECTION_TYPE ON DOMAIN_TYPE_ID = DOMAIN_SELECTION_TYPE.ID + WHERE DOMAIN_SELECTION_TYPE.NAME = ? + """)) + { + stmt.setString(1, type.name()); + var rs = stmt.executeQuery(); + while (rs.next()) { + ret.add(rs.getString(1)); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + + return ret; + } + + /** Retrieve the EdgeId of all domains of a certain type, + * ignoring entries that are not in the EC_DOMAIN table */ + public EdgeIdList getKnownDomainsByType(Type type) { + EdgeIdList ret = new EdgeIdList<>(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT EC_DOMAIN.ID + FROM DOMAIN_SELECTION + INNER JOIN DOMAIN_SELECTION_TYPE ON DOMAIN_TYPE_ID = DOMAIN_SELECTION_TYPE.ID + INNER JOIN EC_DOMAIN ON DOMAIN_SELECTION.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME + WHERE DOMAIN_SELECTION_TYPE.NAME = ? + """)) + { + stmt.setString(1, type.name()); + var rs = stmt.executeQuery(); + while (rs.next()) { + ret.add(rs.getInt(1)); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + + return ret; + } + + /** Reload the list of domains of a certain type from the source */ + public void reloadDomainsList(Type type) throws IOException, SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT SOURCE, ID FROM DOMAIN_SELECTION_TYPE WHERE NAME = ? 
+ """); + var deleteStatement = conn.prepareStatement(""" + DELETE FROM DOMAIN_SELECTION WHERE DOMAIN_TYPE_ID = ? + """); + var insertStatement = conn.prepareStatement(""" + INSERT IGNORE INTO DOMAIN_SELECTION (DOMAIN_NAME, DOMAIN_TYPE_ID) VALUES (?, ?) + """) + ) + { + stmt.setString(1, type.name()); + var rsp = stmt.executeQuery(); + + if (!rsp.next()) { + throw new RuntimeException("No such domain selection type: " + type); + } + + var source = rsp.getString(1); + int typeId = rsp.getInt(2); + + List downloadDomains = downloadDomainsList(source); + + try { + conn.setAutoCommit(false); + deleteStatement.setInt(1, typeId); + deleteStatement.executeUpdate(); + + for (String domain : downloadDomains) { + insertStatement.setString(1, domain); + insertStatement.setInt(2, typeId); + insertStatement.executeUpdate(); + // Could use batch insert here, but this executes infrequently, so it's not worth the hassle + } + + conn.commit(); + } + catch (SQLException ex) { + conn.rollback(); + throw ex; + } + finally { + conn.setAutoCommit(true); + } + } + } + + private List downloadDomainsList(String source) throws IOException { + List ret = new ArrayList<>(); + + logger.info("Downloading domain list from {}", source); + + try (var br = new BufferedReader(new InputStreamReader(new URL(source).openStream()))) { + String line; + + while ((line = br.readLine()) != null) { + line = cleanDomainListLine(line); + + + if (isValidDomainListEntry(line)) + ret.add(line); + } + } + + logger.info("-- found {}", ret.size()); + + + return ret; + } + + private String cleanDomainListLine(String line) { + line = line.trim(); + + int hashIdx = line.indexOf('#'); + if (hashIdx >= 0) + line = line.substring(0, hashIdx).trim(); + + return line; + } + + private boolean isValidDomainListEntry(String line) { + if (line.isBlank()) + return false; + if (!line.matches("[a-z0-9\\-.]+")) + return false; + + return true; + } +} diff --git a/code/common/db/src/main/resources/sql/current/10-domain-type.sql 
b/code/common/db/src/main/resources/sql/current/10-domain-type.sql new file mode 100644 index 00000000..2011d1f6 --- /dev/null +++ b/code/common/db/src/main/resources/sql/current/10-domain-type.sql @@ -0,0 +1,19 @@ +CREATE TABLE IF NOT EXISTS DOMAIN_SELECTION_TYPE ( + ID INT PRIMARY KEY AUTO_INCREMENT, + NAME VARCHAR(255) UNIQUE, + SOURCE VARCHAR(255) NOT NULL +) +CHARACTER SET utf8mb4 +COLLATE utf8mb4_bin; + +CREATE TABLE IF NOT EXISTS DOMAIN_SELECTION ( + DOMAIN_NAME VARCHAR(255) PRIMARY KEY, + DOMAIN_TYPE_ID INT, + FOREIGN KEY (DOMAIN_TYPE_ID) REFERENCES DOMAIN_SELECTION_TYPE(ID) ON DELETE CASCADE +) +CHARACTER SET utf8mb4 +COLLATE utf8mb4_unicode_ci; + +INSERT IGNORE INTO DOMAIN_SELECTION_TYPE(NAME, SOURCE) +VALUES ('BLOG', 'https://raw.githubusercontent.com/MarginaliaSearch/submit-site-to-marginalia-search/master/blogs.txt'), + ('TEST', 'https://downloads.marginalia.nu/domain-list-test.txt'); \ No newline at end of file diff --git a/code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java b/code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java new file mode 100644 index 00000000..0829f6f5 --- /dev/null +++ b/code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java @@ -0,0 +1,63 @@ +package nu.marginalia.db; + +import com.google.common.collect.Sets; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.io.IOException; +import java.sql.SQLException; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +@Testcontainers +public class DomainTypesTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + 
.withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/current/10-domain-type.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + static DomainTypes domainTypes; + + @BeforeAll + public static void setup() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + + domainTypes = new DomainTypes(dataSource); + } + + @AfterAll + public static void teardown() { + dataSource.close(); + } + + @Test + public void reloadDomainsList() throws SQLException, IOException { + domainTypes.reloadDomainsList(DomainTypes.Type.TEST); + + var downloadedDomains = new HashSet<>(domainTypes.getAllDomainsByType(DomainTypes.Type.TEST)); + + var expectedDomains = Set.of("www.marginalia.nu", "search.marginalia.nu", + "encyclopedia.marginalia.nu", "memex.marginalia.nu"); + + assertEquals(4, downloadedDomains.size()); + assertEquals(Set.of(), Sets.symmetricDifference(expectedDomains, downloadedDomains)); + } + +} \ No newline at end of file diff --git a/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java b/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java index 4bdb5ca1..d9adbff6 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java +++ b/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java @@ -6,7 +6,10 @@ public enum HtmlFeature { MEDIA( "special:media"), JS("special:scripts"), AFFILIATE_LINK( "special:affiliate"), - TRACKING("special:tracking"), + TRACKING_INNOCENT("special:tracking"), + TRACKING_EVIL("special:tracking2"), + + VIEWPORT("special:viewport"), COOKIES("special:cookies"), CATEGORY_FOOD("category:food"), @@ -15,8 +18,43 @@ public enum HtmlFeature { GA_SPAM("special:gaspam"), - UNKNOWN("special:uncategorized") - ; + /** For fingerprinting and ranking */ + 
OPENGRAPH("special:opengraph"), + OPENGRAPH_IMAGE("special:opengraph:image"), + TWITTERCARD("special:twittercard"), + TWITTERCARD_IMAGE("special:twittercard:image"), + FONTAWSESOME("special:fontawesome"), + GOOGLEFONTS("special:googlefonts"), + DNS_PREFETCH("special:dnsprefetch"), + PRELOAD("special:preload"), + PRECONNECT("special:preconnect"), + PINGBACK("special:pingback"), + FEED("special:feed"), + WEBMENTION("special:webmention"), + INDIEAUTH("special:indieauth"), + ME_TAG("special:metag"), + NEXT_TAG("special:nexttag"), + AMPHTML("special:amphtml"), + JSON_LD("special:jsonld"), + ORIGIN_TRIAL("special:origintrial"), + PROFILE_GMPG("special:profile-gpmg"), + QUANTCAST("special:quantcast"), + COOKIELAW("special:cookielaw"), + DIDOMI("special:didomi"), + PARDOT("special:pardot"), + ONESIGNAL("special:onesignal"), + DATE_TAG("special:date_tag"), + NOSCRIPT_TAG("special:noscript_tag"), + + ROBOTS_INDEX("robots:index"), + ROBOTS_FOLLOW("robots:follow"), + ROBOTS_NOODP("robots:noodp"), + ROBOTS_NOYDIR("robots:noydir"), + DOFOLLOW_LINK("special:dofollow"), + APPLE_TOUCH_ICON("special:appleicon"), + + UNKNOWN("special:uncategorized"); + private final String keyword; diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/DocumentLanguageData.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/DocumentLanguageData.java index a40fd637..a889ab2a 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/DocumentLanguageData.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/DocumentLanguageData.java @@ -17,6 +17,15 @@ public class DocumentLanguageData { public final DocumentSentence[] titleSentences; public final TObjectIntHashMap wordCount; + /** for test convenience */ + public static DocumentLanguageData empty() { + return new DocumentLanguageData( + new DocumentSentence[0], + new DocumentSentence[0], + new TObjectIntHashMap<>() + 
); + } + public int totalNumWords() { int ret = 0; for (int i = 0; i < sentences.length; i++) { diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 6ef9a25c..4cc4c63b 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -29,6 +29,7 @@ dependencies { implementation project(':code:api:index-api') implementation project(':code:common:model') + implementation project(':code:common:db') implementation project(':code:common:service') implementation project(':code:common:config') implementation project(':code:common:service-discovery') diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 31fa4bb1..3ecebb80 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -5,6 +5,7 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.process.log.WorkLog; +import nu.marginalia.service.module.DatabaseModule; import plan.CrawlPlanLoader; import plan.CrawlPlan; import nu.marginalia.converting.compiler.InstructionsCompiler; @@ -33,7 +34,8 @@ public class ConverterMain { var plan = new CrawlPlanLoader().load(Path.of(args[0])); Injector injector = Guice.createInjector( - new ConverterModule(plan) + new ConverterModule(plan), + new DatabaseModule() ); injector.getInstance(ConverterMain.class); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java new file mode 100644 index 00000000..95a1b5fd --- /dev/null +++ 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java @@ -0,0 +1,53 @@ +package nu.marginalia.converting.processor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.db.DomainTypes; +import nu.marginalia.model.EdgeDomain; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.sql.SQLException; +import java.util.HashMap; +import java.util.Map; + +/** Converter-side wrapper for common:db's DomainTypes, + * which is a list of domains of a known type (e.g. blog) + */ +@Singleton +public class ConverterDomainTypes { + private final Logger logger = LoggerFactory.getLogger(ConverterDomainTypes.class); + private final Map domainTypes = new HashMap<>(); + + private enum DomainType { + BLOG + } + + @Inject + public ConverterDomainTypes(DomainTypes types) throws SQLException { + var allBlogs = types.getAllDomainsByType(DomainTypes.Type.BLOG); + + if (allBlogs.isEmpty()) { + logger.info("No domains of type BLOG found in database, downloading list"); + try { + types.reloadDomainsList(DomainTypes.Type.BLOG); + allBlogs = types.getAllDomainsByType(DomainTypes.Type.BLOG); + } + catch (IOException ex) { + logger.error("Failed to download domains list", ex); + } + } + + for (var item : allBlogs) { + domainTypes.put(new EdgeDomain(item), DomainType.BLOG); + } + + logger.info("Loaded {} domain types", domainTypes.size()); + + } + + public boolean isBlog(EdgeDomain domain) { + return domainTypes.get(domain) == DomainType.BLOG; + } +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java index ac1c15a2..dea7cefa 100644 --- 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java @@ -12,13 +12,13 @@ import java.util.List; public class DocumentGeneratorExtractor { private static final String defaultValue = "unset"; - public DocumentGenerator generatorCleaned(Document doc) { + public DocumentGenerator detectGenerator(Document doc, String responseHeaders) { var tags = doc.select("meta[name=generator]"); if (tags.size() == 0) { // Some sites have a comment in the head instead of a meta tag - return fingerprintByComments(doc); + return fingerprintServerTech(doc, responseHeaders); } if (tags.size() > 1) { return DocumentGenerator.multiple(); @@ -29,11 +29,14 @@ public class DocumentGeneratorExtractor { generator = removePrefixOrSuffix(generator); if (generator.isBlank()) - return DocumentGenerator.unset(); + return fingerprintServerTech(doc, responseHeaders); + + if (generator.startsWith("AMP by WP")) + return DocumentGenerator.of("wordpress", "wordpress-amp"); String[] parts = StringUtils.split(generator, " ,:!"); if (parts.length == 0) - return DocumentGenerator.unset(); + return fingerprintServerTech(doc, responseHeaders); int slashIdx = parts[0].indexOf('/'); if (slashIdx >= 0) { @@ -42,7 +45,7 @@ public class DocumentGeneratorExtractor { } if (parts.length > 3) { - return DocumentGenerator.unset(); // if it's still very long after trim(), it's probably a custom hand written message + return fingerprintServerTech(doc, responseHeaders); // if it's still very long after trim(), it's probably a custom hand written message } switch (parts[0]) { @@ -73,7 +76,7 @@ public class DocumentGeneratorExtractor { } // Fallback logic when there is no meta tag - private DocumentGenerator fingerprintByComments(Document doc) { + private DocumentGenerator fingerprintServerTech(Document doc, String responseHeaders) { 
for (var comment : doc.getElementsByTag("head").comments()) { String data = comment.getData(); @@ -81,22 +84,43 @@ public class DocumentGeneratorExtractor { if (data.contains("Generated by javadoc")) { return DocumentGenerator.of("javadoc"); } - + if (data.contains("Squarespace")) { + return DocumentGenerator.of("squarespace"); + } if (data.contains("phpBB")) { return DocumentGenerator.of("phpbb"); } } for (var tag : doc.head().getElementsByTag("script")) { - if (tag.html().contains("window.lemmyConfig")) { - return DocumentGenerator.of("lemmy"); - } - if (tag.html().contains("URL_DOMAIN = 'wikidot.com'")) { - return DocumentGenerator.of("wikidot"); - } - if (tag.attr("src").contains("wp-content")) { + String scriptSrc = tag.attr("src"); + + if (scriptSrc.contains("wp-content") || scriptSrc.contains("wp-includes")) { return DocumentGenerator.of("wordpress", "wordpress-sneaky"); } + if (scriptSrc.contains("squarespace.com")) { + return DocumentGenerator.of("squarespace"); + } + if (scriptSrc.contains("cdn.cloversites.com")) { + return DocumentGenerator.of("cloversites"); + } + if (scriptSrc.contains("bndzgl.com")) { + return DocumentGenerator.of("bndzgl"); + } + if (scriptSrc.contains("editmysite.com")) { + return DocumentGenerator.of("editmysite"); + } + if (scriptSrc.contains("website-editor.net")) { + return DocumentGenerator.of("website-editor.net"); + } + String scriptHtml = tag.html(); + if (scriptHtml.contains("window.lemmyConfig")) { + return DocumentGenerator.of("lemmy"); + } + if (scriptHtml.contains("URL_DOMAIN = 'wikidot.com'")) { + return DocumentGenerator.of("wikidot"); + } + } for (var tag : doc.head().getElementsByTag("link")) { @@ -109,6 +133,10 @@ public class DocumentGeneratorExtractor { return DocumentGenerator.of("flarum"); } + if (doc.getElementById("tracpowered") != null) { + return DocumentGenerator.of("trac"); + } + if (doc.getElementById("_xfClientLoadTime") != null) { return DocumentGenerator.of("xenforo"); } @@ -117,6 +145,48 @@ public 
class DocumentGeneratorExtractor { return DocumentGenerator.of("invision"); } + if (doc.getElementById("___gatsby") != null) { + return DocumentGenerator.of("gatsby"); + } + + String[] headers = responseHeaders.toLowerCase().split("\n+"); + for (var header : headers) { + if (header.contains("x-drupal-cache")) { + return DocumentGenerator.of("drupal"); + } + if (header.contains("x-powered-by: asp.net")) { + return DocumentGenerator.of("asp.net"); + } + if (header.contains("x-powered-by: php")) { + return DocumentGenerator.of("php"); + } + if (header.contains("x-powered-by: wp engine")) { + return DocumentGenerator.of("wordpress", "wp-engine", "wordpress-sneaky"); + } + if (header.contains("x-powered-by: statamic")) { + return DocumentGenerator.of("laravel", "statamic"); + } + } + + // These should be all the way down as they are the most generic + for (var header : headers) { + if (header.contains("server: mastodon")) { + return DocumentGenerator.of("mastodon"); + } + if (header.contains("server: gunicorn")) { + return DocumentGenerator.of("gunicorn"); + } + if (header.contains("server: nginx")) { + return DocumentGenerator.of("nginx"); + } + if (header.contains("server: apache")) { + return DocumentGenerator.of("apache"); + } + if (header.contains("server: cowboy")) { + return DocumentGenerator.of("cowboy"); // erlang, really?! 
+ } + } + return DocumentGenerator.unset(); } @@ -138,6 +208,11 @@ public class DocumentGeneratorExtractor { generator = generator.substring(0, dashIdx); } + int parenIdx = generator.indexOf('('); // Some strings have values like 'Drupal 9 (https://www.drupal.org)' + if (parenIdx >= 0) { + generator = generator.substring(0, parenIdx); + } + if (!StringUtils.isAsciiPrintable(generator)) return ""; @@ -170,11 +245,18 @@ public class DocumentGeneratorExtractor { final GeneratorType type = switch (parts[0]) { case "joomla", "wordpress", "drupal", "plone", "postnuke", "divi", "freeway", "unicity", "modx", "sitemagic", "agility", "edlio", "blogger", "slider", "slider_revolution", "gravcms", - "typo3", "dotnetnuke", "cms", "coremedia", "dspace" + "typo3", "dotnetnuke", "cms", "coremedia", "dspace", "laravel", "trac", "bunnypress", "astro", + "ghost", "publii" -> GeneratorType.CMS; case "wix.com", "one.com", "wpbakery", "claris", "wordpress.com", "hubspot", "visual_composer", "mobirise", "everweb", "rapidweaver", "shorthand", - "visual", "nitropack", + "visual", "nitropack", "squarespace", "editmysite", "websiteeditor.net", "website-editor.net", // the hyphenated form is the tag the script-src fingerprint emits + + "svbtle.com", "write.as", "montaigne.io", // blogging platforms, maybe should be in another category? + + "cloversites", // clover is a church-oriented website builder, found that kinda neat + "bndzgl", // band websites ..? 
+ /* these are not SAAS but close enough */ "redux", "bootply" -> GeneratorType.SAAS; @@ -185,7 +267,8 @@ public class DocumentGeneratorExtractor { "pdf2htmlex", "nvu", "mozilla", "golive", "tenfingers", "publisher", "allaire", "neooffice" -> GeneratorType.BOOMER_STATIC; - case "hugo", "jekyll", "hakyll", "gatsby", "react", "gridsome" + case "hugo", "jekyll", "hakyll", "nikola", "zola", "olivetti", "pelican", "sushy", "hexo", "eleventy", + "gridsome", "vuepress", "docusaurus", "docpad", "techou", "quarto", "soupault" -> GeneratorType.ZOOMER_STATIC; case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano", "notepad.exe", "gedit", "me", @@ -198,9 +281,9 @@ public class DocumentGeneratorExtractor { -> GeneratorType.FORUM; case "mediawiki", "dokuwiki", "wikidot", "sharepoint" -> GeneratorType.WIKI; - case "pandoc", "mkdocs", "doxygen", "javadoc" + case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc" -> GeneratorType.DOCS; - case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic" + case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic", "osclass" -> GeneratorType.ECOMMERCE_AND_SPAM; default -> GeneratorType.UNKNOWN; @@ -216,7 +299,7 @@ public class DocumentGeneratorExtractor { public static DocumentGenerator multiple() { // It's *generally* WordPress or the like that injects multiple generator tags - return new DocumentGenerator(GeneratorType.CMS, List.of(defaultValue)); + return new DocumentGenerator(GeneratorType.CMS, List.of("wordpress", "wp-best-guess")); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index 9de7af57..91003172 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -19,14 +19,13 @@ public class DocumentValuator { int textLength) throws DisqualifiedException { double scriptPenalty = getScriptPenalty(parsedDocument); - int textBodyLength = textLength; int rawLength = crawledDocument.documentBody.length(); - if (textBodyLength == 0) { + if (textLength == 0) { throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH); } - return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale + return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale + htmlStandard.offset - scriptPenalty; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index 2ea690f1..57a98879 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -21,25 +21,29 @@ import java.util.Set; @Singleton public class FeatureExtractor { - private static final List trackers = List.of("adform.net", + private static final List innocentTrackers = List.of( + "twitter.com", + "bing.com", + "msn.com"); + private static final List shittyTrackers = List.of("adform.net", "connect.facebook", + "facebook.com/tr", "googletagmanager.com", "googlesyndication.com", - "google.com", - "twitter.com", "smartadserver.com", "doubleclick.com", "2mdn.com", "dmtry.com", - "bing.com", - "msn.com", "amazon-adsystem.com", "alexametrics.com", "rubiconproject.com", "chango.com", "d5nxst8fruw4z.cloudfront.net", "d31qbv1cthcecs.cloudfront.net", - "linkedin.com"); + "linkedin.com", + "perfectaudience.com", + "marketingautomation.services", + "usefathom"); private final 
AdblockSimulator adblockSimulator; private final RecipeDetector recipeDetector; @@ -71,21 +75,119 @@ public class FeatureExtractor { } for (var scriptTag : scriptTags) { - if (isJavascriptTag(scriptTag)) { + final String type = scriptTag.attr("type"); + + if ("application/ld+json".equalsIgnoreCase(type)) { + features.add(HtmlFeature.JSON_LD); + } + else { features.add(HtmlFeature.JS); } } - // 500 IQ web developers use error or load handlers - // sneakily load JS without explicit script tags - for (var link : doc.head().getElementsByTag("link")) { - if (link.hasAttr("onerror")) { - features.add(HtmlFeature.JS); - break; + if (!doc.head().select("meta[name=viewport]").isEmpty()) { // viewport is declared as a meta name, there is no viewport element + features.add(HtmlFeature.VIEWPORT); + } + for (var atag : doc.body().getElementsByTag("a")) { + var rel = atag.attr("rel"); + if (rel.equals("dofollow")) { + features.add(HtmlFeature.DOFOLLOW_LINK); } - if (link.hasAttr("onload")) { + } + + if (!doc.getElementsByTag("date").isEmpty()) { + features.add(HtmlFeature.DATE_TAG); + } + if (!doc.getElementsByTag("noscript").isEmpty()) { + features.add(HtmlFeature.NOSCRIPT_TAG); + } + + + for (var link : doc.head().getElementsByTag("link")) { + + // 500 IQ web developers use error or load handlers + // sneakily load JS without explicit script tags + if (link.hasAttr("onerror")) features.add(HtmlFeature.JS); - break; + if (link.hasAttr("onload")) + features.add(HtmlFeature.JS); + + if (link.attr("rel").equals("pingback")) { // pingback autodiscovery is a link with rel="pingback", not an attribute of that name + features.add(HtmlFeature.PINGBACK); + } + + + var href = link.attr("href"); + + if (href.contains("indieauth")) + features.add(HtmlFeature.INDIEAUTH); + + var rel = link.attr("rel"); + + if (rel.equals("webmention")) + features.add(HtmlFeature.WEBMENTION); + + if (rel.equals("me")) + features.add(HtmlFeature.ME_TAG); + + if (rel.equals("next")) + features.add(HtmlFeature.NEXT_TAG); + + if (rel.equals("alternate") && link.hasAttr("type")) + features.add(HtmlFeature.FEED); + + if (rel.equals("dns-prefetch")) + 
features.add(HtmlFeature.DNS_PREFETCH); + + if (rel.equals("preload")) + features.add(HtmlFeature.PRELOAD); + + if (rel.equals("preconnect")) + features.add(HtmlFeature.PRECONNECT); + + if (rel.equals("amphtml")) + features.add(HtmlFeature.AMPHTML); + + if (rel.equals("apple-touch-icon")) + features.add(HtmlFeature.APPLE_TOUCH_ICON); + + } + + for (var meta : doc.head().getElementsByTag("meta")) { + // robots directives live in the content attribute, e.g. content="index,follow,noodp" + if (meta.attr("name").equals("robots")) { + var content = meta.attr("content"); + if (!content.contains("noindex") && content.contains("index")) { + features.add(HtmlFeature.ROBOTS_INDEX); + } + if (!content.contains("nofollow") && content.contains("follow")) { + features.add(HtmlFeature.ROBOTS_FOLLOW); + } + if (content.contains("noodp")) { + features.add(HtmlFeature.ROBOTS_NOODP); + } + if (content.contains("noydir")) { + features.add(HtmlFeature.ROBOTS_NOYDIR); + } + } + + if (meta.attr("profile").contains("gmpg")) { + features.add(HtmlFeature.PROFILE_GMPG); + } + if (meta.attr("property").equals("og:description")) { + features.add(HtmlFeature.OPENGRAPH); + } + if (meta.attr("property").equals("og:image")) { + features.add(HtmlFeature.OPENGRAPH_IMAGE); + } + if (meta.attr("name").equals("twitter:description")) { + features.add(HtmlFeature.TWITTERCARD); + } + if (meta.attr("name").equals("twitter:image")) { + features.add(HtmlFeature.TWITTERCARD_IMAGE); + } + if (meta.attr("http-equiv").equals("origin-trial")) { + features.add(HtmlFeature.ORIGIN_TRIAL); } } @@ -100,14 +202,74 @@ public class FeatureExtractor { } for (var scriptTag : scriptTags) { - if (hasTrackingScript(scriptTag)) { - features.add(HtmlFeature.TRACKING); - break; + if (hasInvasiveTrackingScript(scriptTag)) { + features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING_EVIL); + } + else if (hasNaiveTrackingScript(scriptTag)) { + features.add(HtmlFeature.TRACKING_INNOCENT); + } + + if (scriptTag.attr("type").equalsIgnoreCase("didomi/javascript")) { // Didomi marks consent-gated scripts through the type attribute value + features.add(HtmlFeature.DIDOMI); + } + + String 
src = scriptTag.attr("src"); + if (src.contains("OneSignalSDK")) { + features.add(HtmlFeature.ONESIGNAL); + } + + String scriptText = scriptTag.html(); + + if (scriptText.contains("'pd.js'")) { + features.add(HtmlFeature.PARDOT); + } + if (scriptText.contains("https://cmp.quantcast.com")) { + features.add(HtmlFeature.QUANTCAST); + } + if (scriptText.contains("https://quantcast.mgr.consensu.org")) { + features.add(HtmlFeature.QUANTCAST); + } + if (scriptText.contains("https://cdn.cookielaw.org")) { + features.add(HtmlFeature.COOKIELAW); + } + if (scriptText.contains("_linkedin_data_partner_id")) { + features.add(HtmlFeature.TRACKING_EVIL); + } + if (scriptText.contains("window.OneSignal")) { + features.add(HtmlFeature.ONESIGNAL); + } + if (scriptText.contains("connect.facebook.net")) { + features.add(HtmlFeature.TRACKING_EVIL); + } + if (scriptText.contains("hotjar.com")) { + features.add(HtmlFeature.TRACKING_INNOCENT); + } + } + + for (var noscript : doc.getElementsByTag("noscript")) { + for (var iframe : noscript.getElementsByTag("iframe")) { + if (hasInvasiveTrackingScript(iframe)) { + features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING_EVIL); + } + else if (hasNaiveTrackingScript(iframe)) { + features.add(HtmlFeature.TRACKING_INNOCENT); + } + } + for (var img : noscript.getElementsByTag("img")) { + if (hasInvasiveTrackingScript(img)) { + features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING_EVIL); + } + else if (hasNaiveTrackingScript(img)) { + features.add(HtmlFeature.TRACKING_INNOCENT); + } } } if (scriptTags.html().contains("google-analytics.com")) { - features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_INNOCENT); } for (var aTag : doc.getElementsByTag("a")) { @@ -129,30 +291,33 @@ public class FeatureExtractor { return features; } - private boolean hasTrackingScript(Element scriptTag) { - return hasTrackingScript(scriptTag.attr("src")); + private boolean 
hasInvasiveTrackingScript(Element scriptTag) { + return hasInvasiveTrackingScript(scriptTag.attr("src")); } + private boolean hasNaiveTrackingScript(Element scriptTag) { + return hasNaiveTrackingScript(scriptTag.attr("src")); + } + private boolean hasInvasiveTrackingScript(String src) { - private boolean hasTrackingScript(String scriptText) { - - for (var tracker : trackers) { - if (scriptText.contains(tracker)) { + for (var tracker : shittyTrackers) { + if (src.contains(tracker)) { return true; } } return false; } - private boolean isJavascriptTag(Element scriptTag) { - final String type = scriptTag.attr("type"); + private boolean hasNaiveTrackingScript(String src) { - if ("application/ld+json".equalsIgnoreCase(type)) { - return false; + for (var tracker : innocentTrackers) { + if (src.contains(tracker)) { + return true; + } } - - return true; + return false; } + boolean isAmazonAffiliateLink(Element aTag) { final String href = aTag.attr("href").toLowerCase(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 15163c6c..c2119688 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -111,9 +111,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin final EdgeUrl url = new EdgeUrl(crawledDocument.url); - final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc); + final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, crawledDocument.headers); - final var specialization = htmlProcessorSpecializations.select(generatorParts); + final var specialization = 
htmlProcessorSpecializations.select(generatorParts, url); if (!specialization.shouldIndex(url)) { throw new DisqualifiedException(DisqualificationReason.IRRELEVANT); @@ -167,7 +167,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin .addGenerator(generatorParts.keywords()) .build(); + words.addAllSyntheticTerms(tagWords); + specialization.amendWords(doc, words); getLinks(url, ret, doc, words); @@ -216,8 +218,23 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin return true; } - // Annoying wordpress crap - if (url.path.startsWith("/tag/") && url.path.endsWith("/")) { + // Annoying blog crap + if (url.path.contains("/tag/") && url.path.endsWith("/")) { + return true; + } + if (url.path.contains("/tags/") && url.path.endsWith("/")) { + return true; + } + if (url.path.contains("/category/") && url.path.endsWith("/")) { + return true; + } + if (url.path.contains("/categories/") && url.path.endsWith("/")) { + return true; + } + if (url.path.contains("/section/") && url.path.endsWith("/")) { + return true; + } + if (url.path.contains("/sections/") && url.path.endsWith("/")) { return true; } return false; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java new file mode 100644 index 00000000..f40654bc --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java @@ -0,0 +1,210 @@ +package nu.marginalia.converting.processor.plugin.specialization; + +import ca.rmen.porterstemmer.PorterStemmer; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.idx.WordFlags; 
+import nu.marginalia.summary.SummaryExtractor; +import org.apache.logging.log4j.util.Strings; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.select.NodeFilter; +import org.jsoup.select.NodeVisitor; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.function.Predicate; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** The blog specialization is used for blogs, and makes heavy assumptions about the nature of the document + * that aren't generally true, but if the categorization is correct, will yield much better results. + */ +@Singleton +public class BlogSpecialization extends DefaultSpecialization { + + @Inject + public BlogSpecialization(SummaryExtractor summaryExtractor) { + super(summaryExtractor); + } + + @Override + public Document prune(Document original) { + var doc = original.clone(); + + // Remove all nav junk, comments and other stuff + doc.filter(new BlogPruningFilter()); + + // If there is an article tag, use that as the root + var articleTags = doc.getElementsByTag("article"); + var firstArticle = articleTags.first(); + if (firstArticle != null) { + var art = firstArticle.clone(); + + doc.body().empty(); + doc.body().appendChild(art); + + return doc; + } + + // Use the default pruning as a fallback + return super.prune(doc); + } + + @Override + public String getSummary(Document original, Set importantWords) { + return super.getSummary(original, importantWords); + } + + private final static List badPathElements = + List.of("/tag/", "/tags/", "/tagged/", "/category/", "/categories/", "/section/", "/sections/", "/page/", "/author/"); + + private final static Predicate dateIndexTest1 = Pattern.compile("^/(\\d{4}/(\\d{2}/){0,2}?)$").asMatchPredicate(); + private final static Predicate dateIndexTest2 = Pattern.compile("^/(\\d{2}/){1,2}$").asMatchPredicate(); + + @Override + public boolean 
shouldIndex(EdgeUrl url) { + String path = url.path; + + // Don't index the root path for blogs, as it is usually an ephemeral list of all posts + if ("/".equals(path)) return false; + + // Likewise for the blog's home page + if (path.endsWith("/blog/")) return false; + if (path.endsWith("/log/")) return false; + if (path.endsWith("/weblog/")) return false; + if (path.endsWith("/posts/")) return false; + if (path.endsWith("/articles/")) return false; + + // Refuse paths that contain any of the bad path elements + for (String badPathElement : badPathElements) { + if (path.contains(badPathElement)) return false; + } + + // We don't want chronological listings + if (dateIndexTest1.test(path)) return false; + if (dateIndexTest2.test(path)) return false; + + return true; + } + + private static PorterStemmer ps = new PorterStemmer(); + public void amendWords(Document doc, DocumentKeywordsBuilder words) { + var tagExtractor = new BlogTagExtractor(); + doc.traverse(tagExtractor); + + var tags = tagExtractor.getTags(); + if (!tags.isEmpty()) { + var stemmed = tags.stream().map(ps::stemWord).collect(Collectors.toSet()); + words.setFlagOnMetadataForWords(WordFlags.Subjects, stemmed); + + Set specialTags = tags.stream().map(s -> "tag:" + s).collect(Collectors.toSet()); + words.addAllSyntheticTerms(specialTags); + } + + } + + /** Removes all the non-content elements from the document, + * making strong blog-specific assumptions about the nature of + * the layout */ + private static class BlogPruningFilter implements NodeFilter { + private static final List badClassElements = Arrays.asList("comment", "reply", "sidebar", "header", "footer", "nav"); + private static final List badIdElements = Arrays.asList("comments", "header", "footer", "nav"); + + @Override + public FilterResult head(Node node, int depth) { + if (node instanceof Element el) { + String classes = el.attr("class"); + String id = el.id(); + + for (String badClassElement : badClassElements) { + if 
(classes.contains(badClassElement)) { + return FilterResult.REMOVE; + } + } + for (String badIdElement : badIdElements) { + if (id.contains(badIdElement)) { + return FilterResult.REMOVE; + } + } + } + return FilterResult.CONTINUE; + } + } + + + // Extract tag keywords from the blog post + public static class BlogTagExtractor implements NodeVisitor { + private final Set tags = new HashSet<>(); + int lookForTags = -1; + + public Set getTags() { + Set tagsClean = tags.stream().map(String::toLowerCase).map(this::cleanTag).filter(Strings::isNotBlank).collect(Collectors.toSet()); + + // If there are more than 5 tags, it's probably a global tag listing + // and not a post-specific tag listing + if (tagsClean.size() > 5) + return Set.of(); + + return tagsClean; + } + + private final Pattern splitterPattern = Pattern.compile("\\s+"); + private final Pattern noisePattern = Pattern.compile("[^a-zA-Z0-9]"); + + // This is hideously expensive but blog posts are relatively few and far between + private String cleanTag(String tag) { + + String[] parts = splitterPattern.split(tag); + + if (parts.length > 3) + return ""; + + for (int i = 0; i < parts.length; i++) { + if (parts[i].startsWith("#")) + parts[i] = parts[i].substring(1); + else if (parts[i].startsWith("(") && parts[i].endsWith(")")) + parts[i] = ""; + else + parts[i] = noisePattern.matcher(parts[i]).replaceAll(""); + + if (parts[i].equals("tags")) + parts[i] = ""; + } + + + return Arrays.stream(parts).filter(Strings::isNotBlank).collect(Collectors.joining("_")); + } + + @Override + public void head(Node node, int depth) { + + if (!(node instanceof Element el)) { + return; + } + + if (lookForTags < 0) { + if (el.attr("class").contains("tags")) { + lookForTags = depth; + } + if (el.tagName().equals("a")) { + if (el.attr("class").contains("tag") + || el.attr("href").startsWith("/tag/")) + tags.add(el.text()); + } + } + else if (el.tagName().equals("a")) { + tags.add(el.text()); + } + + } + public void tail(Node node, int 
depth) { + if (depth <= lookForTags) { lookForTags = -1; } + } + } +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java index dab6df24..b64c1dde 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java @@ -2,7 +2,9 @@ package nu.marginalia.converting.processor.plugin.specialization; import com.google.inject.Inject; import com.google.inject.Singleton; +import nu.marginalia.converting.processor.ConverterDomainTypes; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.EdgeUrl; import org.jsoup.nodes.Document; @@ -10,27 +12,41 @@ import java.util.Set; @Singleton public class HtmlProcessorSpecializations { + private final ConverterDomainTypes domainTypes; private final LemmySpecialization lemmySpecialization; private final XenForoSpecialization xenforoSpecialization; private final PhpBBSpecialization phpBBSpecialization; private final JavadocSpecialization javadocSpecialization; + private final BlogSpecialization blogSpecialization; private final DefaultSpecialization defaultSpecialization; @Inject - public HtmlProcessorSpecializations(LemmySpecialization lemmySpecialization, + public HtmlProcessorSpecializations(ConverterDomainTypes domainTypes, + LemmySpecialization lemmySpecialization, XenForoSpecialization xenforoSpecialization, PhpBBSpecialization phpBBSpecialization, JavadocSpecialization javadocSpecialization, + BlogSpecialization blogSpecialization, DefaultSpecialization 
defaultSpecialization) { + this.domainTypes = domainTypes; this.lemmySpecialization = lemmySpecialization; this.xenforoSpecialization = xenforoSpecialization; this.phpBBSpecialization = phpBBSpecialization; this.javadocSpecialization = javadocSpecialization; + this.blogSpecialization = blogSpecialization; this.defaultSpecialization = defaultSpecialization; } /** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */ - public HtmlProcessorSpecializationIf select(DocumentGeneratorExtractor.DocumentGenerator generator) { + public HtmlProcessorSpecializationIf select( + DocumentGeneratorExtractor.DocumentGenerator generator, + EdgeUrl url) + { + + if (domainTypes.isBlog(url.domain)) { + return blogSpecialization; + } + if (generator.keywords().contains("lemmy")) { return lemmySpecialization; } @@ -58,5 +74,8 @@ public class HtmlProcessorSpecializations { default boolean shouldIndex(EdgeUrl url) { return true; } default double lengthModifier() { return 1.0; } + + default void amendWords(Document doc, DocumentKeywordsBuilder words) {} + } } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java index f92b1bc9..4471e4d1 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java @@ -4,6 +4,8 @@ import com.google.inject.AbstractModule; import com.google.inject.name.Names; import nu.marginalia.LanguageModels; import nu.marginalia.WmsaHome; +import nu.marginalia.converting.processor.ConverterDomainTypes; +import org.mockito.Mockito; public class ConvertingIntegrationTestModule extends AbstractModule { public void configure() { @@ -13,5 +15,6 @@ public class 
ConvertingIntegrationTestModule extends AbstractModule { bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255); bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); + bind(ConverterDomainTypes.class).toInstance(Mockito.mock(ConverterDomainTypes.class)); } } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecializationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecializationTest.java new file mode 100644 index 00000000..4ec49f14 --- /dev/null +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecializationTest.java @@ -0,0 +1,17 @@ +package nu.marginalia.converting.processor.plugin.specialization; + +import nu.marginalia.model.EdgeUrl; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class BlogSpecializationTest { + + @Test + void shouldIndex() throws Exception { + var spec = new BlogSpecialization(null); + assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/2023/00/22/"))); + assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/2023/00/"))); + assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/00/22/"))); + } +} \ No newline at end of file diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java index 823b92f8..355921ea 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java +++ 
b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java @@ -34,7 +34,7 @@ class JavadocSpecializationTest { @Test void generatorExtraction() { - var gen = generatorExtractor.generatorCleaned(Jsoup.parse(thread)); + var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), ""); System.out.println(gen); } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java index f89abd17..7aab1759 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java @@ -37,8 +37,8 @@ class LemmySpecializationTest { @Test void generatorExtraction() { - var generatorIndex = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyIndexHtml)); - var generatorPost = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyPost)); + var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), ""); + var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), ""); System.out.println(generatorIndex); System.out.println(generatorPost); diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java index a10e3ca0..40914ba8 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java +++ 
b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java @@ -34,7 +34,7 @@ class XenForoSpecializationTest { @Test void generatorExtraction() { - var gen = generatorExtractor.generatorCleaned(Jsoup.parse(thread)); + var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), ""); System.out.println(gen); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/LinkFilterSelector.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/LinkFilterSelector.java index fcb70166..e3b5f998 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/LinkFilterSelector.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/LinkFilterSelector.java @@ -20,7 +20,8 @@ public class LinkFilterSelector { } if (isLemmy(head)) { - return url -> url.path.startsWith("/post/") || url.path.startsWith("/c/"); + return url -> url.path.startsWith("/post/") + || (url.path.startsWith("/c/") && !url.path.contains("@")); } if (isDiscourse(head)) { return url -> url.path.startsWith("/t/") || url.path.contains("/latest"); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java index 37e7bf62..c100388e 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java @@ -211,7 +211,7 @@ public class IndexQueryService { return switch (priority) { case BEST -> false; case GOOD -> resultCount > params.fetchSize / 4; - case FALLBACK -> resultCount > params.fetchSize / 256; + case FALLBACK -> resultCount > params.fetchSize / 8; }; } diff --git 
a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java index 6a89483b..b44f2551 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java @@ -121,7 +121,7 @@ public class UrlDetails { for (var problem :EnumSet.of( HtmlFeature.JS, - HtmlFeature.TRACKING, + HtmlFeature.TRACKING_INNOCENT, HtmlFeature.AFFILIATE_LINK, HtmlFeature.COOKIES, HtmlFeature.ADVERTISEMENT)) { @@ -156,7 +156,7 @@ public class UrlDetails { return HtmlFeature.hasFeature(features, HtmlFeature.JS); } public boolean isTracking() { - return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING); + return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_INNOCENT); } public boolean isAffiliate() { return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK); diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java index fb8c536d..452be709 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java @@ -1,18 +1,12 @@ package nu.marginalia.tools.experiments; import com.google.inject.Inject; -import nu.marginalia.converting.model.GeneratorType; -import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.DomainProcessor; -import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; +import nu.marginalia.converting.processor.plugin.specialization.BlogSpecialization; import nu.marginalia.crawling.model.CrawledDomain; import 
nu.marginalia.tools.Experiment; -import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; -import java.util.HashSet; -import java.util.Set; - public class DebugConverterExperiment extends Experiment { @@ -24,56 +18,25 @@ public class DebugConverterExperiment extends Experiment { } - Set seenGenerators = new HashSet<>(); - @Override public boolean process(CrawledDomain domain) { if (domain.doc == null) return true; - var dge = new DocumentGeneratorExtractor(); - for (var doc : domain.doc) { if (doc.documentBody == null) continue; var parsed = Jsoup.parse(doc.documentBody.decode()); - parsed.getElementsByTag("head").comments() - .stream().filter(c -> { - String data = c.getData(); - if (data.contains("" + generators.type()); - if (generators.type() == GeneratorType.UNKNOWN) { - System.out.println(parsed.select("meta[name=generator]") - .attr("content")); - System.out.println(doc.url); - } - } + var tagExtractor = new BlogSpecialization.BlogTagExtractor(); + parsed.traverse(tagExtractor); + var tags = tagExtractor.getTags(); + if (!tags.isEmpty()) { + System.out.println(tags); } } -// -// var ret = domainProcessor.process(domain); -// -// -// ret.documents.stream() -// .filter(ProcessedDocument::isProcessedFully) -// .peek(d -> System.out.println(d.url)) -// .map(d -> d.details.metadata) -// .forEach(System.out::println); - return true; }