Better fingerprinting (#35)

* Better fingerprinting for server tech
* Many more features in FeatureExtractor
* Blog specialization
* SiteType table
This commit is contained in:
Viktor 2023-07-10 17:36:12 +02:00 committed by GitHub
parent ae9537b68e
commit 0f9b90eb1c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 963 additions and 116 deletions

View File

@ -2,6 +2,7 @@ plugins {
id 'java' id 'java'
id "io.freefair.lombok" version "5.3.3.3" id "io.freefair.lombok" version "5.3.3.3"
id 'jvm-test-suite' id 'jvm-test-suite'
} }
java { java {
@ -32,8 +33,14 @@ dependencies {
testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit testImplementation libs.bundles.junit
testImplementation libs.mockito testImplementation libs.mockito
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
} }
test { test {
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
maxHeapSize = "8G" maxHeapSize = "8G"
@ -47,4 +54,3 @@ task fastTests(type: Test) {
excludeTags "slow" excludeTags "slow"
} }
} }

View File

@ -0,0 +1,179 @@
package nu.marginalia.db;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeIdList;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/** A list of domains that are known to be of a certain type.
 * <p>
 * The lists are stored in the DOMAIN_SELECTION table, keyed by the types in
 * DOMAIN_SELECTION_TYPE.  Each type has a SOURCE URL from which its list can be
 * re-downloaded via {@link #reloadDomainsList(Type)}.
 */
@Singleton
public class DomainTypes {

    /** The known domain types; the enum constant's name is matched against DOMAIN_SELECTION_TYPE.NAME */
    public enum Type {
        BLOG,
        TEST
    }

    /** An acceptable list entry consists solely of lowercase ASCII letters, digits, dashes and dots.
     *  Compiled once, since it is applied to every line of a downloaded list. */
    private static final Pattern VALID_DOMAIN_PATTERN = Pattern.compile("[a-z0-9\\-.]+");

    private final Logger logger = LoggerFactory.getLogger(DomainTypes.class);
    private final HikariDataSource dataSource;

    @Inject
    public DomainTypes(HikariDataSource dataSource) {
        this.dataSource = dataSource;
    }

    /** Get all domains of a certain type, including domains that are not in the EC_DOMAIN table
     *
     * @param type the type of domain to list
     * @return the domain names registered for the type, empty if there are none
     * @throws RuntimeException wrapping any SQLException raised by the query
     */
    public List<String> getAllDomainsByType(Type type) {
        List<String> ret = new ArrayList<>();

        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT DOMAIN_NAME
                     FROM DOMAIN_SELECTION INNER JOIN DOMAIN_SELECTION_TYPE ON DOMAIN_TYPE_ID = DOMAIN_SELECTION_TYPE.ID
                     WHERE DOMAIN_SELECTION_TYPE.NAME = ?
                     """))
        {
            stmt.setString(1, type.name());

            try (var rs = stmt.executeQuery()) {
                while (rs.next()) {
                    ret.add(rs.getString(1));
                }
            }
        }
        catch (SQLException ex) {
            throw new RuntimeException(ex);
        }

        return ret;
    }

    /** Retrieve the EdgeId of all domains of a certain type,
     * ignoring entries that are not in the EC_DOMAIN table
     *
     * @param type the type of domain to list
     * @return the EC_DOMAIN ids of the matching domains, empty if there are none
     * @throws RuntimeException wrapping any SQLException raised by the query
     */
    public EdgeIdList<EdgeDomain> getKnownDomainsByType(Type type) {
        EdgeIdList<EdgeDomain> ret = new EdgeIdList<>();

        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT EC_DOMAIN.ID
                     FROM DOMAIN_SELECTION
                     INNER JOIN DOMAIN_SELECTION_TYPE ON DOMAIN_TYPE_ID = DOMAIN_SELECTION_TYPE.ID
                     INNER JOIN EC_DOMAIN ON DOMAIN_SELECTION.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME
                     WHERE DOMAIN_SELECTION_TYPE.NAME = ?
                     """))
        {
            stmt.setString(1, type.name());

            try (var rs = stmt.executeQuery()) {
                while (rs.next()) {
                    ret.add(rs.getInt(1));
                }
            }
        }
        catch (SQLException ex) {
            throw new RuntimeException(ex);
        }

        return ret;
    }

    /** Reload the list of domains of a certain type from the source.
     * <p>
     * The existing entries for the type are deleted and the freshly downloaded
     * ones inserted within a single transaction, so a failure leaves the
     * previous list in place.
     *
     * @param type the type of domain list to reload
     * @throws IOException if the list can not be downloaded
     * @throws SQLException if the database can not be read or updated
     * @throws RuntimeException if the type has no row in DOMAIN_SELECTION_TYPE
     */
    public void reloadDomainsList(Type type) throws IOException, SQLException {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT SOURCE, ID FROM DOMAIN_SELECTION_TYPE WHERE NAME = ?
                     """);
             var deleteStatement = conn.prepareStatement("""
                     DELETE FROM DOMAIN_SELECTION WHERE DOMAIN_TYPE_ID = ?
                     """);
             var insertStatement = conn.prepareStatement("""
                     INSERT IGNORE INTO DOMAIN_SELECTION (DOMAIN_NAME, DOMAIN_TYPE_ID) VALUES (?, ?)
                     """)
        )
        {
            String source;
            int typeId;

            stmt.setString(1, type.name());
            try (var rsp = stmt.executeQuery()) {
                if (!rsp.next()) {
                    throw new RuntimeException("No such domain selection type: " + type);
                }
                source = rsp.getString(1);
                typeId = rsp.getInt(2);
            }

            // Download before touching the table, so a failed download changes nothing
            List<String> downloadDomains = downloadDomainsList(source);

            try {
                conn.setAutoCommit(false);

                deleteStatement.setInt(1, typeId);
                deleteStatement.executeUpdate();

                for (String domain : downloadDomains) {
                    insertStatement.setString(1, domain);
                    insertStatement.setInt(2, typeId);
                    insertStatement.executeUpdate();
                    // Could use batch insert here, but this executes infrequently, so it's not worth the hassle
                }

                conn.commit();
            }
            catch (SQLException | RuntimeException ex) {
                // Unchecked exceptions must roll back as well: re-enabling auto-commit
                // in the finally block would otherwise commit the half-finished update
                try {
                    conn.rollback();
                }
                catch (SQLException rollbackEx) {
                    // Keep the original failure as the primary exception
                    ex.addSuppressed(rollbackEx);
                }
                throw ex;
            }
            finally {
                conn.setAutoCommit(true);
            }
        }
    }

    /** Download the list at the given URL, returning its valid entries in file order */
    private List<String> downloadDomainsList(String source) throws IOException {
        List<String> ret = new ArrayList<>();

        logger.info("Downloading domain list from {}", source);

        // Decode explicitly as UTF-8 rather than relying on the platform default charset
        try (var br = new BufferedReader(
                new InputStreamReader(new URL(source).openStream(), StandardCharsets.UTF_8))) {
            String line;

            while ((line = br.readLine()) != null) {
                line = cleanDomainListLine(line);

                if (isValidDomainListEntry(line))
                    ret.add(line);
            }
        }

        logger.info("-- found {}", ret.size());

        return ret;
    }

    /** Strip surrounding whitespace and any trailing '#' comment from a line */
    private String cleanDomainListLine(String line) {
        line = line.trim();

        int hashIdx = line.indexOf('#');
        if (hashIdx >= 0)
            line = line.substring(0, hashIdx).trim();

        return line;
    }

    /** Accept only non-blank lines made up entirely of characters permitted by VALID_DOMAIN_PATTERN */
    private boolean isValidDomainListEntry(String line) {
        if (line.isBlank())
            return false;

        return VALID_DOMAIN_PATTERN.matcher(line).matches();
    }
}

View File

@ -0,0 +1,19 @@
-- The kinds of domain lists that exist.  NAME is the key the DomainTypes class
-- looks a type up by, and SOURCE is the URL the list is downloaded from.
CREATE TABLE IF NOT EXISTS DOMAIN_SELECTION_TYPE (
    ID INT PRIMARY KEY AUTO_INCREMENT,
    NAME VARCHAR(255) UNIQUE,
    SOURCE VARCHAR(255) NOT NULL
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_bin;

-- The domains belonging to each list.  Rows are removed along with their type.
-- Created with IF NOT EXISTS, like the table above, so the script can be re-run.
CREATE TABLE IF NOT EXISTS DOMAIN_SELECTION (
    DOMAIN_NAME VARCHAR(255) PRIMARY KEY,
    DOMAIN_TYPE_ID INT,
    FOREIGN KEY (DOMAIN_TYPE_ID) REFERENCES DOMAIN_SELECTION_TYPE(ID) ON DELETE CASCADE
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;

-- Seed the known types; INSERT IGNORE leaves already existing rows untouched.
INSERT IGNORE INTO DOMAIN_SELECTION_TYPE(NAME, SOURCE)
VALUES ('BLOG', 'https://raw.githubusercontent.com/MarginaliaSearch/submit-site-to-marginalia-search/master/blogs.txt'),
       ('TEST', 'https://downloads.marginalia.nu/domain-list-test.txt');

View File

@ -0,0 +1,63 @@
package nu.marginalia.db;
import com.google.common.collect.Sets;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import java.io.IOException;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
import static org.junit.jupiter.api.Assertions.assertEquals;
/** Integration test for DomainTypes, run against a MariaDB test container
 * initialized with sql/current/10-domain-type.sql.
 * <p>
 * Reloading the TEST list downloads it from the SOURCE URL registered by
 * that init script, so this test needs network access.
 */
@Testcontainers
public class DomainTypesTest {
    @Container
    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
            .withDatabaseName("WMSA_prod")
            .withUsername("wmsa")
            .withPassword("wmsa")
            .withInitScript("sql/current/10-domain-type.sql")
            .withNetworkAliases("mariadb");

    static HikariDataSource dataSource;
    static DomainTypes domainTypes;

    @BeforeAll
    public static void setup() {
        HikariConfig config = new HikariConfig();
        config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
        config.setUsername("wmsa");
        config.setPassword("wmsa");

        dataSource = new HikariDataSource(config);
        domainTypes = new DomainTypes(dataSource);
    }

    @AfterAll
    public static void teardown() {
        // setup() may have failed before the data source was created; don't
        // pile a NullPointerException on top of the original failure
        if (dataSource != null) {
            dataSource.close();
        }
    }

    /** Reloading the TEST list should leave exactly the four expected domains in the database */
    @Test
    public void reloadDomainsList() throws SQLException, IOException {
        domainTypes.reloadDomainsList(DomainTypes.Type.TEST);

        var downloadedDomains = new HashSet<>(domainTypes.getAllDomainsByType(DomainTypes.Type.TEST));
        var expectedDomains = Set.of("www.marginalia.nu", "search.marginalia.nu",
                "encyclopedia.marginalia.nu", "memex.marginalia.nu");

        assertEquals(4, downloadedDomains.size());
        assertEquals(Set.of(), Sets.symmetricDifference(expectedDomains, downloadedDomains));
    }
}

View File

@ -6,7 +6,10 @@ public enum HtmlFeature {
MEDIA( "special:media"), MEDIA( "special:media"),
JS("special:scripts"), JS("special:scripts"),
AFFILIATE_LINK( "special:affiliate"), AFFILIATE_LINK( "special:affiliate"),
TRACKING("special:tracking"), TRACKING_INNOCENT("special:tracking"),
TRACKING_EVIL("special:tracking2"),
VIEWPORT("special:viewport"),
COOKIES("special:cookies"), COOKIES("special:cookies"),
CATEGORY_FOOD("category:food"), CATEGORY_FOOD("category:food"),
@ -15,8 +18,43 @@ public enum HtmlFeature {
GA_SPAM("special:gaspam"), GA_SPAM("special:gaspam"),
UNKNOWN("special:uncategorized") /** For fingerprinting and ranking */
; OPENGRAPH("special:opengraph"),
OPENGRAPH_IMAGE("special:opengraph:image"),
TWITTERCARD("special:twittercard"),
TWITTERCARD_IMAGE("special:twittercard:image"),
FONTAWSESOME("special:fontawesome"),
GOOGLEFONTS("special:googlefonts"),
DNS_PREFETCH("special:dnsprefetch"),
PRELOAD("special:preload"),
PRECONNECT("special:preconnect"),
PINGBACK("special:pingback"),
FEED("special:feed"),
WEBMENTION("special:webmention"),
INDIEAUTH("special:indieauth"),
ME_TAG("special:metag"),
NEXT_TAG("special:nexttag"),
AMPHTML("special:amphtml"),
JSON_LD("special:jsonld"),
ORIGIN_TRIAL("special:origintrial"),
PROFILE_GMPG("special:profile-gpmg"),
QUANTCAST("special:quantcast"),
COOKIELAW("special:cookielaw"),
DIDOMI("special:didomi"),
PARDOT("special:pardot"),
ONESIGNAL("special:onesignal"),
DATE_TAG("special:date_tag"),
NOSCRIPT_TAG("special:noscript_tag"),
ROBOTS_INDEX("robots:index"),
ROBOTS_FOLLOW("robots:follow"),
ROBOTS_NOODP("robots:noodp"),
ROBOTS_NOYDIR("robots:noydir"),
DOFOLLOW_LINK("special:dofollow"),
APPLE_TOUCH_ICON("special:appleicon"),
UNKNOWN("special:uncategorized");
private final String keyword; private final String keyword;

View File

@ -17,6 +17,15 @@ public class DocumentLanguageData {
public final DocumentSentence[] titleSentences; public final DocumentSentence[] titleSentences;
public final TObjectIntHashMap<String> wordCount; public final TObjectIntHashMap<String> wordCount;
/** for test convenience */
public static DocumentLanguageData empty() {
return new DocumentLanguageData(
new DocumentSentence[0],
new DocumentSentence[0],
new TObjectIntHashMap<>()
);
}
public int totalNumWords() { public int totalNumWords() {
int ret = 0; int ret = 0;
for (int i = 0; i < sentences.length; i++) { for (int i = 0; i < sentences.length; i++) {

View File

@ -29,6 +29,7 @@ dependencies {
implementation project(':code:api:index-api') implementation project(':code:api:index-api')
implementation project(':code:common:model') implementation project(':code:common:model')
implementation project(':code:common:db')
implementation project(':code:common:service') implementation project(':code:common:service')
implementation project(':code:common:config') implementation project(':code:common:config')
implementation project(':code:common:service-discovery') implementation project(':code:common:service-discovery')

View File

@ -5,6 +5,7 @@ import com.google.inject.Guice;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Injector; import com.google.inject.Injector;
import nu.marginalia.process.log.WorkLog; import nu.marginalia.process.log.WorkLog;
import nu.marginalia.service.module.DatabaseModule;
import plan.CrawlPlanLoader; import plan.CrawlPlanLoader;
import plan.CrawlPlan; import plan.CrawlPlan;
import nu.marginalia.converting.compiler.InstructionsCompiler; import nu.marginalia.converting.compiler.InstructionsCompiler;
@ -33,7 +34,8 @@ public class ConverterMain {
var plan = new CrawlPlanLoader().load(Path.of(args[0])); var plan = new CrawlPlanLoader().load(Path.of(args[0]));
Injector injector = Guice.createInjector( Injector injector = Guice.createInjector(
new ConverterModule(plan) new ConverterModule(plan),
new DatabaseModule()
); );
injector.getInstance(ConverterMain.class); injector.getInstance(ConverterMain.class);

View File

@ -0,0 +1,53 @@
package nu.marginalia.converting.processor;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.db.DomainTypes;
import nu.marginalia.model.EdgeDomain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Map;
/** Converter-side wrapper for common:db's DomainTypes,
 * which is a list of domains of a known type (e.g. blog).
 * <p>
 * The list is read once, when the instance is constructed, and kept in memory.
 */
@Singleton
public class ConverterDomainTypes {
    private enum DomainType {
        BLOG
    }

    private final Map<EdgeDomain, DomainType> domainTypes = new HashMap<>();
    private final Logger logger = LoggerFactory.getLogger(ConverterDomainTypes.class);

    /** Load the BLOG domains from the database.  If the database has none,
     * a download of the list is attempted first; a failed download is logged
     * and leaves the lookup empty.
     *
     * @param types the database-backed domain type lists
     * @throws SQLException if reloading the list fails on the database side
     */
    @Inject
    public ConverterDomainTypes(DomainTypes types) throws SQLException {
        var blogDomainNames = types.getAllDomainsByType(DomainTypes.Type.BLOG);

        if (blogDomainNames.isEmpty()) {
            logger.info("No domains of type BLOG found in database, downloading list");

            try {
                types.reloadDomainsList(DomainTypes.Type.BLOG);
                blogDomainNames = types.getAllDomainsByType(DomainTypes.Type.BLOG);
            }
            catch (IOException downloadFailure) {
                logger.error("Failed to download domains list", downloadFailure);
            }
        }

        for (String domainName : blogDomainNames) {
            domainTypes.put(new EdgeDomain(domainName), DomainType.BLOG);
        }

        logger.info("Loaded {} domain types", domainTypes.size());
    }

    /** @return true if the domain is on the list of known blogs */
    public boolean isBlog(EdgeDomain domain) {
        return DomainType.BLOG.equals(domainTypes.get(domain));
    }
}

View File

@ -12,13 +12,13 @@ import java.util.List;
public class DocumentGeneratorExtractor { public class DocumentGeneratorExtractor {
private static final String defaultValue = "unset"; private static final String defaultValue = "unset";
public DocumentGenerator generatorCleaned(Document doc) { public DocumentGenerator detectGenerator(Document doc, String responseHeaders) {
var tags = doc.select("meta[name=generator]"); var tags = doc.select("meta[name=generator]");
if (tags.size() == 0) { if (tags.size() == 0) {
// Some sites have a comment in the head instead of a meta tag // Some sites have a comment in the head instead of a meta tag
return fingerprintByComments(doc); return fingerprintServerTech(doc, responseHeaders);
} }
if (tags.size() > 1) { if (tags.size() > 1) {
return DocumentGenerator.multiple(); return DocumentGenerator.multiple();
@ -29,11 +29,14 @@ public class DocumentGeneratorExtractor {
generator = removePrefixOrSuffix(generator); generator = removePrefixOrSuffix(generator);
if (generator.isBlank()) if (generator.isBlank())
return DocumentGenerator.unset(); return fingerprintServerTech(doc, responseHeaders);
if (generator.startsWith("AMP by WP"))
return DocumentGenerator.of("wordpress", "wordpress-amp");
String[] parts = StringUtils.split(generator, " ,:!"); String[] parts = StringUtils.split(generator, " ,:!");
if (parts.length == 0) if (parts.length == 0)
return DocumentGenerator.unset(); return fingerprintServerTech(doc, responseHeaders);
int slashIdx = parts[0].indexOf('/'); int slashIdx = parts[0].indexOf('/');
if (slashIdx >= 0) { if (slashIdx >= 0) {
@ -42,7 +45,7 @@ public class DocumentGeneratorExtractor {
} }
if (parts.length > 3) { if (parts.length > 3) {
return DocumentGenerator.unset(); // if it's still very long after trim(), it's probably a custom hand written message return fingerprintServerTech(doc, responseHeaders); // if it's still very long after trim(), it's probably a custom hand written message
} }
switch (parts[0]) { switch (parts[0]) {
@ -73,7 +76,7 @@ public class DocumentGeneratorExtractor {
} }
// Fallback logic when there is no meta tag // Fallback logic when there is no meta tag
private DocumentGenerator fingerprintByComments(Document doc) { private DocumentGenerator fingerprintServerTech(Document doc, String responseHeaders) {
for (var comment : doc.getElementsByTag("head").comments()) { for (var comment : doc.getElementsByTag("head").comments()) {
String data = comment.getData(); String data = comment.getData();
@ -81,22 +84,43 @@ public class DocumentGeneratorExtractor {
if (data.contains("Generated by javadoc")) { if (data.contains("Generated by javadoc")) {
return DocumentGenerator.of("javadoc"); return DocumentGenerator.of("javadoc");
} }
if (data.contains("Squarespace")) {
return DocumentGenerator.of("squarespace");
}
if (data.contains("phpBB")) { if (data.contains("phpBB")) {
return DocumentGenerator.of("phpbb"); return DocumentGenerator.of("phpbb");
} }
} }
for (var tag : doc.head().getElementsByTag("script")) { for (var tag : doc.head().getElementsByTag("script")) {
if (tag.html().contains("window.lemmyConfig")) { String scriptSrc = tag.attr("src");
return DocumentGenerator.of("lemmy");
} if (scriptSrc.contains("wp-content") || scriptSrc.contains("wp-includes")) {
if (tag.html().contains("URL_DOMAIN = 'wikidot.com'")) {
return DocumentGenerator.of("wikidot");
}
if (tag.attr("src").contains("wp-content")) {
return DocumentGenerator.of("wordpress", "wordpress-sneaky"); return DocumentGenerator.of("wordpress", "wordpress-sneaky");
} }
if (scriptSrc.contains("squarespace.com")) {
return DocumentGenerator.of("squarespace");
}
if (scriptSrc.contains("cdn.cloversites.com")) {
return DocumentGenerator.of("cloversites");
}
if (scriptSrc.contains("bndzgl.com")) {
return DocumentGenerator.of("bndzgl");
}
if (scriptSrc.contains("editmysite.com")) {
return DocumentGenerator.of("editmysite");
}
if (scriptSrc.contains("website-editor.net")) {
return DocumentGenerator.of("website-editor.net");
}
String scriptHtml = tag.html();
if (scriptHtml.contains("window.lemmyConfig")) {
return DocumentGenerator.of("lemmy");
}
if (scriptHtml.contains("URL_DOMAIN = 'wikidot.com'")) {
return DocumentGenerator.of("wikidot");
}
} }
for (var tag : doc.head().getElementsByTag("link")) { for (var tag : doc.head().getElementsByTag("link")) {
@ -109,6 +133,10 @@ public class DocumentGeneratorExtractor {
return DocumentGenerator.of("flarum"); return DocumentGenerator.of("flarum");
} }
if (doc.getElementById("tracpowered") != null) {
return DocumentGenerator.of("trac");
}
if (doc.getElementById("_xfClientLoadTime") != null) { if (doc.getElementById("_xfClientLoadTime") != null) {
return DocumentGenerator.of("xenforo"); return DocumentGenerator.of("xenforo");
} }
@ -117,6 +145,48 @@ public class DocumentGeneratorExtractor {
return DocumentGenerator.of("invision"); return DocumentGenerator.of("invision");
} }
if (doc.getElementById("___gatsby") != null) {
return DocumentGenerator.of("gatsby");
}
String[] headers = responseHeaders.toLowerCase().split("\n+");
for (var header : headers) {
if (header.contains("x-drupal-cache")) {
return DocumentGenerator.of("drupal");
}
if (header.contains("x-powered-by: asp.net")) {
return DocumentGenerator.of("asp.net");
}
if (header.contains("x-powered-by: php")) {
return DocumentGenerator.of("php");
}
if (header.contains("x-powered-by: wp engine")) {
return DocumentGenerator.of("wordpress", "wp-engine", "wordpress-sneaky");
}
if (header.contains("x-powered-by: statamic")) {
return DocumentGenerator.of("laravel", "statamic");
}
}
// These should be all the way down as they are the most generic
for (var header : headers) {
if (header.contains("server: mastodon")) {
return DocumentGenerator.of("mastodon");
}
if (header.contains("server: gunicorn")) {
return DocumentGenerator.of("gunicorn");
}
if (header.contains("server: nginx")) {
return DocumentGenerator.of("nginx");
}
if (header.contains("server: apache")) {
return DocumentGenerator.of("apache");
}
if (header.contains("server: cowboy")) {
return DocumentGenerator.of("cowboy"); // erlang, really?!
}
}
return DocumentGenerator.unset(); return DocumentGenerator.unset();
} }
@ -138,6 +208,11 @@ public class DocumentGeneratorExtractor {
generator = generator.substring(0, dashIdx); generator = generator.substring(0, dashIdx);
} }
int parenIdx = generator.indexOf('('); // Some strings have values like 'Drupal 9 (https://www.drupal.org)'
if (parenIdx >= 0) {
generator = generator.substring(0, parenIdx);
}
if (!StringUtils.isAsciiPrintable(generator)) if (!StringUtils.isAsciiPrintable(generator))
return ""; return "";
@ -170,11 +245,18 @@ public class DocumentGeneratorExtractor {
final GeneratorType type = switch (parts[0]) { final GeneratorType type = switch (parts[0]) {
case "joomla", "wordpress", "drupal", "plone", "postnuke", "divi", "freeway", "unicity", case "joomla", "wordpress", "drupal", "plone", "postnuke", "divi", "freeway", "unicity",
"modx", "sitemagic", "agility", "edlio", "blogger", "slider", "slider_revolution", "gravcms", "modx", "sitemagic", "agility", "edlio", "blogger", "slider", "slider_revolution", "gravcms",
"typo3", "dotnetnuke", "cms", "coremedia", "dspace" "typo3", "dotnetnuke", "cms", "coremedia", "dspace", "laravel", "trac", "bunnypress", "astro",
"ghost", "publii"
-> GeneratorType.CMS; -> GeneratorType.CMS;
case "wix.com", "one.com", "wpbakery", "claris", "wordpress.com", "hubspot", case "wix.com", "one.com", "wpbakery", "claris", "wordpress.com", "hubspot",
"visual_composer", "mobirise", "everweb", "rapidweaver", "shorthand", "visual_composer", "mobirise", "everweb", "rapidweaver", "shorthand",
"visual", "nitropack", "visual", "nitropack", "squarespace", "editmysite", "websiteeditor.net",
"svbtle.com", "write.as", "montaigne.io", // blogging platforms, maybe should be in another category?
"cloversites", // clover is a church-oriented website builder, found that kinda neat
"bndzgl", // band websites ..?
/* these are not SAAS but close enough */ /* these are not SAAS but close enough */
"redux", "bootply" "redux", "bootply"
-> GeneratorType.SAAS; -> GeneratorType.SAAS;
@ -185,7 +267,8 @@ public class DocumentGeneratorExtractor {
"pdf2htmlex", "nvu", "mozilla", "golive", "tenfingers", "publisher", "pdf2htmlex", "nvu", "mozilla", "golive", "tenfingers", "publisher",
"allaire", "neooffice" "allaire", "neooffice"
-> GeneratorType.BOOMER_STATIC; -> GeneratorType.BOOMER_STATIC;
case "hugo", "jekyll", "hakyll", "gatsby", "react", "gridsome" case "hugo", "jekyll", "hakyll", "nikola", "zola", "olivetti", "pelican", "sushy", "hexo", "eleventy",
"gridsome", "vuepress", "docusaurus", "docpad", "techou", "quarto", "soupault"
-> GeneratorType.ZOOMER_STATIC; -> GeneratorType.ZOOMER_STATIC;
case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano", case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano",
"notepad.exe", "gedit", "me", "notepad.exe", "gedit", "me",
@ -198,9 +281,9 @@ public class DocumentGeneratorExtractor {
-> GeneratorType.FORUM; -> GeneratorType.FORUM;
case "mediawiki", "dokuwiki", "wikidot", "sharepoint" case "mediawiki", "dokuwiki", "wikidot", "sharepoint"
-> GeneratorType.WIKI; -> GeneratorType.WIKI;
case "pandoc", "mkdocs", "doxygen", "javadoc" case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc"
-> GeneratorType.DOCS; -> GeneratorType.DOCS;
case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic" case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic", "osclass"
-> GeneratorType.ECOMMERCE_AND_SPAM; -> GeneratorType.ECOMMERCE_AND_SPAM;
default default
-> GeneratorType.UNKNOWN; -> GeneratorType.UNKNOWN;
@ -216,7 +299,7 @@ public class DocumentGeneratorExtractor {
public static DocumentGenerator multiple() { public static DocumentGenerator multiple() {
// It's *generally* WordPress or the like that injects multiple generator tags // It's *generally* WordPress or the like that injects multiple generator tags
return new DocumentGenerator(GeneratorType.CMS, List.of(defaultValue)); return new DocumentGenerator(GeneratorType.CMS, List.of("wordpress", "wp-best-guess"));
} }
} }

View File

@ -19,14 +19,13 @@ public class DocumentValuator {
int textLength) throws DisqualifiedException { int textLength) throws DisqualifiedException {
double scriptPenalty = getScriptPenalty(parsedDocument); double scriptPenalty = getScriptPenalty(parsedDocument);
int textBodyLength = textLength;
int rawLength = crawledDocument.documentBody.length(); int rawLength = crawledDocument.documentBody.length();
if (textBodyLength == 0) { if (textLength == 0) {
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH); throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
} }
return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale
+ htmlStandard.offset + htmlStandard.offset
- scriptPenalty; - scriptPenalty;
} }

View File

@ -21,25 +21,29 @@ import java.util.Set;
@Singleton @Singleton
public class FeatureExtractor { public class FeatureExtractor {
private static final List<String> trackers = List.of("adform.net", private static final List<String> innocentTrackers = List.of(
"twitter.com",
"bing.com",
"msn.com");
private static final List<String> shittyTrackers = List.of("adform.net",
"connect.facebook", "connect.facebook",
"facebook.com/tr",
"googletagmanager.com", "googletagmanager.com",
"googlesyndication.com", "googlesyndication.com",
"google.com",
"twitter.com",
"smartadserver.com", "smartadserver.com",
"doubleclick.com", "doubleclick.com",
"2mdn.com", "2mdn.com",
"dmtry.com", "dmtry.com",
"bing.com",
"msn.com",
"amazon-adsystem.com", "amazon-adsystem.com",
"alexametrics.com", "alexametrics.com",
"rubiconproject.com", "rubiconproject.com",
"chango.com", "chango.com",
"d5nxst8fruw4z.cloudfront.net", "d5nxst8fruw4z.cloudfront.net",
"d31qbv1cthcecs.cloudfront.net", "d31qbv1cthcecs.cloudfront.net",
"linkedin.com"); "linkedin.com",
"perfectaudience.com",
"marketingautomation.services",
"usefathom");
private final AdblockSimulator adblockSimulator; private final AdblockSimulator adblockSimulator;
private final RecipeDetector recipeDetector; private final RecipeDetector recipeDetector;
@ -71,21 +75,119 @@ public class FeatureExtractor {
} }
for (var scriptTag : scriptTags) { for (var scriptTag : scriptTags) {
if (isJavascriptTag(scriptTag)) { final String type = scriptTag.attr("type");
if ("application/ld+json".equalsIgnoreCase(type)) {
features.add(HtmlFeature.JSON_LD);
}
else {
features.add(HtmlFeature.JS); features.add(HtmlFeature.JS);
} }
} }
// 500 IQ web developers use <link> error or load handlers if (!doc.head().getElementsByTag("viewport").isEmpty()) {
// sneakily load JS without explicit script tags features.add(HtmlFeature.VIEWPORT);
for (var link : doc.head().getElementsByTag("link")) { }
if (link.hasAttr("onerror")) { for (var atag : doc.body().getElementsByTag("a")) {
features.add(HtmlFeature.JS); var rel = atag.attr("rel");
break; if (rel.equals("dofollow")) {
features.add(HtmlFeature.DOFOLLOW_LINK);
} }
if (link.hasAttr("onload")) { }
if (!doc.getElementsByTag("date").isEmpty()) {
features.add(HtmlFeature.DATE_TAG);
}
if (!doc.getElementsByTag("noscript").isEmpty()) {
features.add(HtmlFeature.NOSCRIPT_TAG);
}
for (var link : doc.head().getElementsByTag("link")) {
// 500 IQ web developers use <link> error or load handlers
// sneakily load JS without explicit script tags
if (link.hasAttr("onerror"))
features.add(HtmlFeature.JS); features.add(HtmlFeature.JS);
break; if (link.hasAttr("onload"))
features.add(HtmlFeature.JS);
if (link.hasAttr("pingback")) {
features.add(HtmlFeature.PINGBACK);
}
var href = link.attr("href");
if (href.contains("indieauth"))
features.add(HtmlFeature.INDIEAUTH);
var rel = link.attr("rel");
if (rel.equals("webmention"))
features.add(HtmlFeature.WEBMENTION);
if (rel.equals("me"))
features.add(HtmlFeature.ME_TAG);
if (rel.equals("next"))
features.add(HtmlFeature.NEXT_TAG);
if (rel.equals("alternate") && link.hasAttr("type"))
features.add(HtmlFeature.FEED);
if (rel.equals("dns-prefetch"))
features.add(HtmlFeature.DNS_PREFETCH);
if (rel.equals("preload"))
features.add(HtmlFeature.PRELOAD);
if (rel.equals("preconnect"))
features.add(HtmlFeature.PRECONNECT);
if (rel.equals("amphtml"))
features.add(HtmlFeature.AMPHTML);
if (rel.equals("apple-touch-icon"))
features.add(HtmlFeature.APPLE_TOUCH_ICON);
}
for (var meta : doc.head().getElementsByTag("meta")) {
// <meta name="robots" content="index,follow">
if (meta.attr("name").equals("robots")) {
var content = meta.attr("content");
if (!content.contains("noindex") && content.contains("index")) {
features.add(HtmlFeature.ROBOTS_INDEX);
}
if (!content.contains("nofollow") && content.contains("follow")) {
features.add(HtmlFeature.ROBOTS_FOLLOW);
}
if (content.contains("noodp")) {
features.add(HtmlFeature.ROBOTS_NOODP);
}
if (content.contains("noydir")) {
features.add(HtmlFeature.ROBOTS_NOYDIR);
}
}
if (meta.attr("profile").contains("gmpg")) {
features.add(HtmlFeature.PROFILE_GMPG);
}
if (meta.attr("property").equals("og:description")) {
features.add(HtmlFeature.OPENGRAPH);
}
if (meta.attr("property").equals("og:image")) {
features.add(HtmlFeature.OPENGRAPH_IMAGE);
}
if (meta.attr("name").equals("twitter:description")) {
features.add(HtmlFeature.TWITTERCARD);
}
if (meta.attr("name").equals("twitter:image")) {
features.add(HtmlFeature.TWITTERCARD_IMAGE);
}
if (meta.attr("http-equiv").equals("origin-trial")) {
features.add(HtmlFeature.ORIGIN_TRIAL);
} }
} }
@ -100,14 +202,74 @@ public class FeatureExtractor {
} }
for (var scriptTag : scriptTags) { for (var scriptTag : scriptTags) {
if (hasTrackingScript(scriptTag)) { if (hasInvasiveTrackingScript(scriptTag)) {
features.add(HtmlFeature.TRACKING); features.add(HtmlFeature.TRACKING_INNOCENT);
break; features.add(HtmlFeature.TRACKING_EVIL);
}
else if (hasNaiveTrackingScript(scriptTag)) {
features.add(HtmlFeature.TRACKING_INNOCENT);
}
if (scriptTag.hasAttr("didomi/javascript")) {
features.add(HtmlFeature.DIDOMI);
}
String src = scriptTag.attr("src");
if (src.contains("OneSignalSDK")) {
features.add(HtmlFeature.ONESIGNAL);
}
String scriptText = scriptTag.html();
if (scriptText.contains("'pd.js'")) {
features.add(HtmlFeature.PARDOT);
}
if (scriptText.contains("https://cmp.quantcast.com")) {
features.add(HtmlFeature.QUANTCAST);
}
if (scriptText.contains("https://quantcast.mgr.consensu.org")) {
features.add(HtmlFeature.QUANTCAST);
}
if (scriptText.contains("https://cdn.cookielaw.org")) {
features.add(HtmlFeature.COOKIELAW);
}
if (scriptText.contains("_linkedin_data_partner_id")) {
features.add(HtmlFeature.TRACKING_EVIL);
}
if (scriptText.contains("window.OneSignal")) {
features.add(HtmlFeature.ONESIGNAL);
}
if (scriptText.contains("connect.facebook.net")) {
features.add(HtmlFeature.TRACKING_EVIL);
}
if (scriptText.contains("hotjar.com")) {
features.add(HtmlFeature.TRACKING_INNOCENT);
}
}
for (var noscript : doc.getElementsByTag("noscript")) {
for (var iframe : noscript.getElementsByTag("iframe")) {
if (hasInvasiveTrackingScript(iframe)) {
features.add(HtmlFeature.TRACKING_INNOCENT);
features.add(HtmlFeature.TRACKING_EVIL);
}
else if (hasNaiveTrackingScript(iframe)) {
features.add(HtmlFeature.TRACKING_INNOCENT);
}
}
for (var img : noscript.getElementsByTag("img")) {
if (hasInvasiveTrackingScript(img)) {
features.add(HtmlFeature.TRACKING_INNOCENT);
features.add(HtmlFeature.TRACKING_EVIL);
}
else if (hasNaiveTrackingScript(img)) {
features.add(HtmlFeature.TRACKING_INNOCENT);
}
} }
} }
if (scriptTags.html().contains("google-analytics.com")) { if (scriptTags.html().contains("google-analytics.com")) {
features.add(HtmlFeature.TRACKING); features.add(HtmlFeature.TRACKING_INNOCENT);
} }
for (var aTag : doc.getElementsByTag("a")) { for (var aTag : doc.getElementsByTag("a")) {
@ -129,30 +291,33 @@ public class FeatureExtractor {
return features; return features;
} }
private boolean hasTrackingScript(Element scriptTag) { private boolean hasInvasiveTrackingScript(Element scriptTag) {
return hasTrackingScript(scriptTag.attr("src")); return hasInvasiveTrackingScript(scriptTag.attr("src"));
} }
private boolean hasNaiveTrackingScript(Element scriptTag) {
return hasNaiveTrackingScript(scriptTag.attr("src"));
}
private boolean hasInvasiveTrackingScript(String src) {
private boolean hasTrackingScript(String scriptText) { for (var tracker : shittyTrackers) {
if (src.contains(tracker)) {
for (var tracker : trackers) {
if (scriptText.contains(tracker)) {
return true; return true;
} }
} }
return false; return false;
} }
private boolean isJavascriptTag(Element scriptTag) { private boolean hasNaiveTrackingScript(String src) {
final String type = scriptTag.attr("type");
if ("application/ld+json".equalsIgnoreCase(type)) { for (var tracker : innocentTrackers) {
return false; if (src.contains(tracker)) {
return true;
}
} }
return false;
return true;
} }
boolean isAmazonAffiliateLink(Element aTag) { boolean isAmazonAffiliateLink(Element aTag) {
final String href = aTag.attr("href").toLowerCase(); final String href = aTag.attr("href").toLowerCase();

View File

@ -111,9 +111,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
final EdgeUrl url = new EdgeUrl(crawledDocument.url); final EdgeUrl url = new EdgeUrl(crawledDocument.url);
final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc); final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, crawledDocument.headers);
final var specialization = htmlProcessorSpecializations.select(generatorParts); final var specialization = htmlProcessorSpecializations.select(generatorParts, url);
if (!specialization.shouldIndex(url)) { if (!specialization.shouldIndex(url)) {
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT); throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
@ -167,7 +167,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
.addGenerator(generatorParts.keywords()) .addGenerator(generatorParts.keywords())
.build(); .build();
words.addAllSyntheticTerms(tagWords); words.addAllSyntheticTerms(tagWords);
specialization.amendWords(doc, words);
getLinks(url, ret, doc, words); getLinks(url, ret, doc, words);
@ -216,8 +218,23 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
return true; return true;
} }
// Annoying wordpress crap // Annoying blog crap
if (url.path.startsWith("/tag/") && url.path.endsWith("/")) { if (url.path.contains("/tag/") && url.path.endsWith("/")) {
return true;
}
if (url.path.contains("/tags/") && url.path.endsWith("/")) {
return true;
}
if (url.path.contains("/category/") && url.path.endsWith("/")) {
return true;
}
if (url.path.contains("/categories/") && url.path.endsWith("/")) {
return true;
}
if (url.path.contains("/section/") && url.path.endsWith("/")) {
return true;
}
if (url.path.contains("/sections/") && url.path.endsWith("/")) {
return true; return true;
} }
return false; return false;

View File

@ -0,0 +1,210 @@
package nu.marginalia.converting.processor.plugin.specialization;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.summary.SummaryExtractor;
import org.apache.logging.log4j.util.Strings;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeFilter;
import org.jsoup.select.NodeVisitor;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/** Specialization used for domains that are categorized as blogs.
 * <p>
 * It makes heavy assumptions about the nature of the document that aren't generally true,
 * but if the categorization is correct, it will yield much better results.
 */
@Singleton
public class BlogSpecialization extends DefaultSpecialization {

    /** Path fragments that mark listing pages (tags, categories, pagination, authors) rather than posts */
    private static final List<String> badPathElements =
            List.of("/tag/", "/tags/", "/tagged/", "/category/", "/categories/", "/section/", "/sections/", "/page/", "/author/");

    /** Path endings typical of a blog's home page */
    private static final List<String> homePageSuffixes =
            List.of("/blog/", "/log/", "/weblog/", "/posts/", "/articles/");

    /** Matches chronological listings of the form /yyyy/, /yyyy/nn/ and /yyyy/nn/nn/ */
    private static final Predicate<String> dateIndexTest1 = Pattern.compile("^/(\\d{4}/(\\d{2}/){0,2}?)$").asMatchPredicate();

    /** Matches chronological listings of the form /nn/ and /nn/nn/ */
    private static final Predicate<String> dateIndexTest2 = Pattern.compile("^/(\\d{2}/){1,2}$").asMatchPredicate();

    private static final PorterStemmer ps = new PorterStemmer();

    @Inject
    public BlogSpecialization(SummaryExtractor summaryExtractor) {
        super(summaryExtractor);
    }

    /** Returns a pruned copy of the document; the original is left untouched.
     * <p>
     * Elements that look like navigation, comments and similar page furniture are removed.
     * If an article tag remains afterwards, the body is reduced to the first such tag;
     * otherwise the default pruning is applied to the filtered copy.
     */
    @Override
    public Document prune(Document original) {
        var doc = original.clone();

        // Remove all nav junk, comments and other stuff
        doc.filter(new BlogPruningFilter());

        // If there is an article tag, use that as the root
        var firstArticle = doc.getElementsByTag("article").first();
        if (firstArticle != null) {
            // Clone before emptying the body, as the article is itself a descendant of the body
            var art = firstArticle.clone();
            doc.body().empty();
            doc.body().appendChild(art);
            return doc;
        }

        // Use the default pruning as a fallback
        return super.prune(doc);
    }

    /** Delegates to the default summary logic without modification. */
    @Override
    public String getSummary(Document original, Set<String> importantWords) {
        return super.getSummary(original, importantWords);
    }

    /** Decides whether a URL on a blog is worth indexing.
     *
     * @param url the URL under consideration; only its path is inspected
     * @return false for the root path, blog home pages, tag/category/author/pagination
     *         listings and chronological listings; true otherwise
     */
    @Override
    public boolean shouldIndex(EdgeUrl url) {
        final String path = url.path;

        // Don't index the root path for blogs, as it is usually an ephemeral list of all posts
        if ("/".equals(path)) {
            return false;
        }

        // Likewise for the blog's home page
        for (String suffix : homePageSuffixes) {
            if (path.endsWith(suffix)) {
                return false;
            }
        }

        // Refuse paths that contain any of the bad path elements
        for (String badPathElement : badPathElements) {
            if (path.contains(badPathElement)) {
                return false;
            }
        }

        // We don't want chronological listings
        return !dateIndexTest1.test(path) && !dateIndexTest2.test(path);
    }

    /** Extracts the post's tags from the document and feeds them into the keywords.
     * <p>
     * The stemmed tags get the Subjects flag set on their metadata, and each tag is
     * additionally added as a synthetic term with a "tag:" prefix. Nothing is done
     * if no tags are found.
     */
    @Override
    public void amendWords(Document doc, DocumentKeywordsBuilder words) {
        var tagExtractor = new BlogTagExtractor();
        doc.traverse(tagExtractor);

        var tags = tagExtractor.getTags();
        if (tags.isEmpty()) {
            return;
        }

        var stemmed = tags.stream().map(ps::stemWord).collect(Collectors.toSet());
        words.setFlagOnMetadataForWords(WordFlags.Subjects, stemmed);

        Set<String> specialTags = tags.stream().map(s -> "tag:" + s).collect(Collectors.toSet());
        words.addAllSyntheticTerms(specialTags);
    }

    /** Removes all the non-content elements from the document,
     * making strong blog-specific assumptions about the nature of
     * the layout */
    private static class BlogPruningFilter implements NodeFilter {
        private static final List<String> badClassElements = List.of("comment", "reply", "sidebar", "header", "footer", "nav");
        private static final List<String> badIdElements = List.of("comments", "header", "footer", "nav");

        @Override
        public FilterResult head(Node node, int depth) {
            if (!(node instanceof Element el)) {
                return FilterResult.CONTINUE;
            }

            // The class and id tests below are substring matches, and themes commonly put
            // classes like "no-sidebar" or "has-header-image" on the body tag; removing the
            // document's structural roots would throw away the entire page
            if (isStructuralRoot(el)) {
                return FilterResult.CONTINUE;
            }

            final String classes = el.attr("class");
            for (String badClassElement : badClassElements) {
                if (classes.contains(badClassElement)) {
                    return FilterResult.REMOVE;
                }
            }

            final String id = el.id();
            for (String badIdElement : badIdElements) {
                if (id.contains(badIdElement)) {
                    return FilterResult.REMOVE;
                }
            }

            return FilterResult.CONTINUE;
        }

        private static boolean isStructuralRoot(Element el) {
            final String tagName = el.tagName();
            return "html".equalsIgnoreCase(tagName) || "body".equalsIgnoreCase(tagName);
        }
    }

    /** Extracts tag keywords from the blog post.
     * <p>
     * Link texts are collected from anchors that either sit inside an element whose class
     * contains "tags", or whose own class contains "tag", or whose href starts with "/tag/".
     */
    public static class BlogTagExtractor implements NodeVisitor {
        private static final Pattern splitterPattern = Pattern.compile("\\s+");
        private static final Pattern noisePattern = Pattern.compile("[^a-zA-Z0-9]");

        private final Set<String> tags = new HashSet<>();

        /** Depth of the enclosing element whose class contains "tags", or -1 when outside such an element */
        private int lookForTags = -1;

        /** Returns the cleaned, lower-cased tags found during traversal.
         *
         * @return the set of tags, or an empty set if more than 5 distinct tags were found
         */
        public Set<String> getTags() {
            Set<String> tagsClean = tags.stream()
                    // Locale-independent lower-casing; the noise pattern below only keeps ASCII
                    // letters, so a locale-specific mapping (e.g. Turkish dotless i) would corrupt tags
                    .map(tag -> tag.toLowerCase(java.util.Locale.ROOT))
                    .map(this::cleanTag)
                    .filter(Strings::isNotBlank)
                    .collect(Collectors.toSet());

            // If there are more than 5 tags, it's probably a global tag listing
            // and not a post-specific tag listing
            if (tagsClean.size() > 5) {
                return Set.of();
            }

            return tagsClean;
        }

        /** Normalizes a single tag text.
         * <p>
         * Texts with more than 3 whitespace-separated parts are rejected (empty string).
         * Parenthesized parts and the word "tags" are dropped, non-alphanumeric characters
         * are stripped from the rest, and the surviving parts are joined with underscores.
         */
        private String cleanTag(String tag) {
            String[] parts = splitterPattern.split(tag);

            if (parts.length > 3) {
                return "";
            }

            for (int i = 0; i < parts.length; i++) {
                if (parts[i].startsWith("(") && parts[i].endsWith(")")) {
                    // Parenthesized parts are dropped in their entirety
                    parts[i] = "";
                }
                else {
                    // Also takes care of a leading '#', so hashtag-style parts
                    // receive the same cleaning as every other part
                    parts[i] = noisePattern.matcher(parts[i]).replaceAll("");
                }

                if (parts[i].equals("tags")) {
                    parts[i] = "";
                }
            }

            return Arrays.stream(parts).filter(Strings::isNotBlank).collect(Collectors.joining("_"));
        }

        @Override
        public void head(Node node, int depth) {
            if (!(node instanceof Element el)) {
                return;
            }

            if (lookForTags < 0) {
                if (el.attr("class").contains("tags")) {
                    lookForTags = depth;
                }

                if (el.tagName().equals("a")) {
                    if (el.attr("class").contains("tag")
                            || el.attr("href").startsWith("/tag/")) {
                        tags.add(el.text());
                    }
                }
            }
            else if (el.tagName().equals("a")) {
                // Inside a tags container, every link is assumed to be a tag
                tags.add(el.text());
            }
        }

        @Override
        public void tail(Node node, int depth) {
            // Leaving the tags container (or anything above it) ends the tag search
            if (depth <= lookForTags) {
                lookForTags = -1;
            }
        }
    }
}

View File

@ -2,7 +2,9 @@ package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.converting.processor.ConverterDomainTypes;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
@ -10,27 +12,41 @@ import java.util.Set;
@Singleton @Singleton
public class HtmlProcessorSpecializations { public class HtmlProcessorSpecializations {
private final ConverterDomainTypes domainTypes;
private final LemmySpecialization lemmySpecialization; private final LemmySpecialization lemmySpecialization;
private final XenForoSpecialization xenforoSpecialization; private final XenForoSpecialization xenforoSpecialization;
private final PhpBBSpecialization phpBBSpecialization; private final PhpBBSpecialization phpBBSpecialization;
private final JavadocSpecialization javadocSpecialization; private final JavadocSpecialization javadocSpecialization;
private final BlogSpecialization blogSpecialization;
private final DefaultSpecialization defaultSpecialization; private final DefaultSpecialization defaultSpecialization;
@Inject @Inject
public HtmlProcessorSpecializations(LemmySpecialization lemmySpecialization, public HtmlProcessorSpecializations(ConverterDomainTypes domainTypes,
LemmySpecialization lemmySpecialization,
XenForoSpecialization xenforoSpecialization, XenForoSpecialization xenforoSpecialization,
PhpBBSpecialization phpBBSpecialization, PhpBBSpecialization phpBBSpecialization,
JavadocSpecialization javadocSpecialization, JavadocSpecialization javadocSpecialization,
BlogSpecialization blogSpecialization,
DefaultSpecialization defaultSpecialization) { DefaultSpecialization defaultSpecialization) {
this.domainTypes = domainTypes;
this.lemmySpecialization = lemmySpecialization; this.lemmySpecialization = lemmySpecialization;
this.xenforoSpecialization = xenforoSpecialization; this.xenforoSpecialization = xenforoSpecialization;
this.phpBBSpecialization = phpBBSpecialization; this.phpBBSpecialization = phpBBSpecialization;
this.javadocSpecialization = javadocSpecialization; this.javadocSpecialization = javadocSpecialization;
this.blogSpecialization = blogSpecialization;
this.defaultSpecialization = defaultSpecialization; this.defaultSpecialization = defaultSpecialization;
} }
/** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */ /** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */
public HtmlProcessorSpecializationIf select(DocumentGeneratorExtractor.DocumentGenerator generator) { public HtmlProcessorSpecializationIf select(
DocumentGeneratorExtractor.DocumentGenerator generator,
EdgeUrl url)
{
if (domainTypes.isBlog(url.domain)) {
return blogSpecialization;
}
if (generator.keywords().contains("lemmy")) { if (generator.keywords().contains("lemmy")) {
return lemmySpecialization; return lemmySpecialization;
} }
@ -58,5 +74,8 @@ public class HtmlProcessorSpecializations {
default boolean shouldIndex(EdgeUrl url) { return true; } default boolean shouldIndex(EdgeUrl url) { return true; }
default double lengthModifier() { return 1.0; } default double lengthModifier() { return 1.0; }
default void amendWords(Document doc, DocumentKeywordsBuilder words) {}
} }
} }

View File

@ -4,6 +4,8 @@ import com.google.inject.AbstractModule;
import com.google.inject.name.Names; import com.google.inject.name.Names;
import nu.marginalia.LanguageModels; import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.ConverterDomainTypes;
import org.mockito.Mockito;
public class ConvertingIntegrationTestModule extends AbstractModule { public class ConvertingIntegrationTestModule extends AbstractModule {
public void configure() { public void configure() {
@ -13,5 +15,6 @@ public class ConvertingIntegrationTestModule extends AbstractModule {
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255); bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
bind(ConverterDomainTypes.class).toInstance(Mockito.mock(ConverterDomainTypes.class));
} }
} }

View File

@ -0,0 +1,17 @@
package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
class BlogSpecializationTest {

    /** Verifies that listing pages are rejected while an actual post is accepted. */
    @Test
    void shouldIndex() throws Exception {
        // shouldIndex only inspects the URL's path, so no summary extractor is supplied
        var spec = new BlogSpecialization(null);

        // Chronological listings
        assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/2023/00/22/")));
        assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/2023/00/")));
        assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/00/22/")));

        // The root path and tag listings
        assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/")));
        assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/tag/java/")));

        // A post below a dated path is not a listing; without a positive case
        // an implementation that rejects every URL would pass this test
        assertTrue(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/2023/00/22/some-post/")));
    }
}

View File

@ -34,7 +34,7 @@ class JavadocSpecializationTest {
@Test @Test
void generatorExtraction() { void generatorExtraction() {
var gen = generatorExtractor.generatorCleaned(Jsoup.parse(thread)); var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), "");
System.out.println(gen); System.out.println(gen);
} }

View File

@ -37,8 +37,8 @@ class LemmySpecializationTest {
@Test @Test
void generatorExtraction() { void generatorExtraction() {
var generatorIndex = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyIndexHtml)); var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), "");
var generatorPost = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyPost)); var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), "");
System.out.println(generatorIndex); System.out.println(generatorIndex);
System.out.println(generatorPost); System.out.println(generatorPost);

View File

@ -34,7 +34,7 @@ class XenForoSpecializationTest {
@Test @Test
void generatorExtraction() { void generatorExtraction() {
var gen = generatorExtractor.generatorCleaned(Jsoup.parse(thread)); var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), "");
System.out.println(gen); System.out.println(gen);
} }

View File

@ -20,7 +20,8 @@ public class LinkFilterSelector {
} }
if (isLemmy(head)) { if (isLemmy(head)) {
return url -> url.path.startsWith("/post/") || url.path.startsWith("/c/"); return url -> url.path.startsWith("/post/")
|| (url.path.startsWith("/c/") && !url.path.contains("@"));
} }
if (isDiscourse(head)) { if (isDiscourse(head)) {
return url -> url.path.startsWith("/t/") || url.path.contains("/latest"); return url -> url.path.startsWith("/t/") || url.path.contains("/latest");

View File

@ -211,7 +211,7 @@ public class IndexQueryService {
return switch (priority) { return switch (priority) {
case BEST -> false; case BEST -> false;
case GOOD -> resultCount > params.fetchSize / 4; case GOOD -> resultCount > params.fetchSize / 4;
case FALLBACK -> resultCount > params.fetchSize / 256; case FALLBACK -> resultCount > params.fetchSize / 8;
}; };
} }

View File

@ -121,7 +121,7 @@ public class UrlDetails {
for (var problem :EnumSet.of( for (var problem :EnumSet.of(
HtmlFeature.JS, HtmlFeature.JS,
HtmlFeature.TRACKING, HtmlFeature.TRACKING_INNOCENT,
HtmlFeature.AFFILIATE_LINK, HtmlFeature.AFFILIATE_LINK,
HtmlFeature.COOKIES, HtmlFeature.COOKIES,
HtmlFeature.ADVERTISEMENT)) { HtmlFeature.ADVERTISEMENT)) {
@ -156,7 +156,7 @@ public class UrlDetails {
return HtmlFeature.hasFeature(features, HtmlFeature.JS); return HtmlFeature.hasFeature(features, HtmlFeature.JS);
} }
public boolean isTracking() { public boolean isTracking() {
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING); return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_INNOCENT);
} }
public boolean isAffiliate() { public boolean isAffiliate() {
return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK); return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK);

View File

@ -1,18 +1,12 @@
package nu.marginalia.tools.experiments; package nu.marginalia.tools.experiments;
import com.google.inject.Inject; import com.google.inject.Inject;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; import nu.marginalia.converting.processor.plugin.specialization.BlogSpecialization;
import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.tools.Experiment; import nu.marginalia.tools.Experiment;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import java.util.HashSet;
import java.util.Set;
public class DebugConverterExperiment extends Experiment { public class DebugConverterExperiment extends Experiment {
@ -24,56 +18,25 @@ public class DebugConverterExperiment extends Experiment {
} }
Set<String> seenGenerators = new HashSet<>();
@Override @Override
public boolean process(CrawledDomain domain) { public boolean process(CrawledDomain domain) {
if (domain.doc == null) return true; if (domain.doc == null) return true;
var dge = new DocumentGeneratorExtractor();
for (var doc : domain.doc) { for (var doc : domain.doc) {
if (doc.documentBody == null) continue; if (doc.documentBody == null) continue;
var parsed = Jsoup.parse(doc.documentBody.decode()); var parsed = Jsoup.parse(doc.documentBody.decode());
parsed.getElementsByTag("head").comments()
.stream().filter(c -> {
String data = c.getData();
if (data.contains("<script"))
return false;
if (data.contains("[if"))
return false;
if (data.contains("shim"))
return false;
return data.contains("Generated by") || data.contains("generated by")
|| data.contains("Powered by") || data.contains("powered by");
}).forEach(System.out::println);
var generators = dge.generatorCleaned(parsed); var tagExtractor = new BlogSpecialization.BlogTagExtractor();
for (var g : generators.keywords()) { parsed.traverse(tagExtractor);
if (seenGenerators.add(g)) { var tags = tagExtractor.getTags();
System.out.println(g + "->" + generators.type()); if (!tags.isEmpty()) {
if (generators.type() == GeneratorType.UNKNOWN) { System.out.println(tags);
System.out.println(parsed.select("meta[name=generator]")
.attr("content"));
System.out.println(doc.url);
}
}
} }
} }
//
// var ret = domainProcessor.process(domain);
//
//
// ret.documents.stream()
// .filter(ProcessedDocument::isProcessedFully)
// .peek(d -> System.out.println(d.url))
// .map(d -> d.details.metadata)
// .forEach(System.out::println);
return true; return true;
} }