mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Better fingerprinting (#35)
* Better fingerprinting for server tech * Many more features in FeatureExtractor * Blog specialization * SiteType table
This commit is contained in:
parent
ae9537b68e
commit
0f9b90eb1c
@ -2,6 +2,7 @@ plugins {
|
||||
id 'java'
|
||||
id "io.freefair.lombok" version "5.3.3.3"
|
||||
id 'jvm-test-suite'
|
||||
|
||||
}
|
||||
|
||||
java {
|
||||
@ -32,8 +33,14 @@ dependencies {
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
||||
|
||||
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
|
||||
testImplementation 'org.testcontainers:mariadb:1.17.4'
|
||||
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
|
||||
}
|
||||
|
||||
|
||||
test {
|
||||
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
|
||||
maxHeapSize = "8G"
|
||||
@ -47,4 +54,3 @@ task fastTests(type: Test) {
|
||||
excludeTags "slow"
|
||||
}
|
||||
}
|
||||
|
||||
|
179
code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java
Normal file
179
code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java
Normal file
@ -0,0 +1,179 @@
|
||||
package nu.marginalia.db;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeIdList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
|
||||
|
||||
/** A list of domains that are known to be of a certain type */
|
||||
@Singleton
|
||||
public class DomainTypes {
|
||||
|
||||
public enum Type {
|
||||
BLOG,
|
||||
TEST
|
||||
};
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(DomainTypes.class);
|
||||
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
@Inject
|
||||
public DomainTypes(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
/** Get all domains of a certain type, including domains that are not in the EC_DOMAIN table */
|
||||
public List<String> getAllDomainsByType(Type type) {
|
||||
List<String> ret = new ArrayList<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT DOMAIN_NAME
|
||||
FROM DOMAIN_SELECTION INNER JOIN DOMAIN_SELECTION_TYPE ON DOMAIN_TYPE_ID = DOMAIN_SELECTION_TYPE.ID
|
||||
WHERE DOMAIN_SELECTION_TYPE.NAME = ?
|
||||
"""))
|
||||
{
|
||||
stmt.setString(1, type.name());
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
ret.add(rs.getString(1));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/** Retrieve the EdgeId of all domains of a certain type,
|
||||
* ignoring entries that are not in the EC_DOMAIN table */
|
||||
public EdgeIdList<EdgeDomain> getKnownDomainsByType(Type type) {
|
||||
EdgeIdList<EdgeDomain> ret = new EdgeIdList<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT EC_DOMAIN.ID
|
||||
FROM DOMAIN_SELECTION
|
||||
INNER JOIN DOMAIN_SELECTION_TYPE ON DOMAIN_TYPE_ID = DOMAIN_SELECTION_TYPE.ID
|
||||
INNER JOIN EC_DOMAIN ON DOMAIN_SELECTION.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME
|
||||
WHERE DOMAIN_SELECTION_TYPE.NAME = ?
|
||||
"""))
|
||||
{
|
||||
stmt.setString(1, type.name());
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
ret.add(rs.getInt(1));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/** Reload the list of domains of a certain type from the source */
|
||||
public void reloadDomainsList(Type type) throws IOException, SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT SOURCE, ID FROM DOMAIN_SELECTION_TYPE WHERE NAME = ?
|
||||
""");
|
||||
var deleteStatement = conn.prepareStatement("""
|
||||
DELETE FROM DOMAIN_SELECTION WHERE DOMAIN_TYPE_ID = ?
|
||||
""");
|
||||
var insertStatement = conn.prepareStatement("""
|
||||
INSERT IGNORE INTO DOMAIN_SELECTION (DOMAIN_NAME, DOMAIN_TYPE_ID) VALUES (?, ?)
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setString(1, type.name());
|
||||
var rsp = stmt.executeQuery();
|
||||
|
||||
if (!rsp.next()) {
|
||||
throw new RuntimeException("No such domain selection type: " + type);
|
||||
}
|
||||
|
||||
var source = rsp.getString(1);
|
||||
int typeId = rsp.getInt(2);
|
||||
|
||||
List<String> downloadDomains = downloadDomainsList(source);
|
||||
|
||||
try {
|
||||
conn.setAutoCommit(false);
|
||||
deleteStatement.setInt(1, typeId);
|
||||
deleteStatement.executeUpdate();
|
||||
|
||||
for (String domain : downloadDomains) {
|
||||
insertStatement.setString(1, domain);
|
||||
insertStatement.setInt(2, typeId);
|
||||
insertStatement.executeUpdate();
|
||||
// Could use batch insert here, but this executes infrequently, so it's not worth the hassle
|
||||
}
|
||||
|
||||
conn.commit();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
conn.rollback();
|
||||
throw ex;
|
||||
}
|
||||
finally {
|
||||
conn.setAutoCommit(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> downloadDomainsList(String source) throws IOException {
|
||||
List<String> ret = new ArrayList<>();
|
||||
|
||||
logger.info("Downloading domain list from {}", source);
|
||||
|
||||
try (var br = new BufferedReader(new InputStreamReader(new URL(source).openStream()))) {
|
||||
String line;
|
||||
|
||||
while ((line = br.readLine()) != null) {
|
||||
line = cleanDomainListLine(line);
|
||||
|
||||
|
||||
if (isValidDomainListEntry(line))
|
||||
ret.add(line);
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("-- found {}", ret.size());
|
||||
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private String cleanDomainListLine(String line) {
|
||||
line = line.trim();
|
||||
|
||||
int hashIdx = line.indexOf('#');
|
||||
if (hashIdx >= 0)
|
||||
line = line.substring(0, hashIdx).trim();
|
||||
|
||||
return line;
|
||||
}
|
||||
|
||||
private boolean isValidDomainListEntry(String line) {
|
||||
if (line.isBlank())
|
||||
return false;
|
||||
if (!line.matches("[a-z0-9\\-.]+"))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
@ -0,0 +1,19 @@
|
||||
-- Types of curated domain lists, and the URL each list is downloaded from.
CREATE TABLE IF NOT EXISTS DOMAIN_SELECTION_TYPE (
    ID INT PRIMARY KEY AUTO_INCREMENT,
    NAME VARCHAR(255) UNIQUE,
    SOURCE VARCHAR(255) NOT NULL
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_bin;

-- Domains belonging to a list. DOMAIN_NAME alone is the primary key,
-- so a domain can be a member of at most one list type.
-- IF NOT EXISTS keeps the script re-runnable, like the statements around it.
CREATE TABLE IF NOT EXISTS DOMAIN_SELECTION (
    DOMAIN_NAME VARCHAR(255) PRIMARY KEY,
    DOMAIN_TYPE_ID INT,
    FOREIGN KEY (DOMAIN_TYPE_ID) REFERENCES DOMAIN_SELECTION_TYPE(ID) ON DELETE CASCADE
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;

-- Seed the known list types; IGNORE leaves already-existing rows untouched.
INSERT IGNORE INTO DOMAIN_SELECTION_TYPE(NAME, SOURCE)
VALUES ('BLOG', 'https://raw.githubusercontent.com/MarginaliaSearch/submit-site-to-marginalia-search/master/blogs.txt'),
       ('TEST', 'https://downloads.marginalia.nu/domain-list-test.txt');
|
@ -0,0 +1,63 @@
|
||||
package nu.marginalia.db;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.testcontainers.containers.MariaDBContainer;
|
||||
import org.testcontainers.junit.jupiter.Container;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
@Testcontainers
|
||||
public class DomainTypesTest {
|
||||
@Container
|
||||
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||
.withDatabaseName("WMSA_prod")
|
||||
.withUsername("wmsa")
|
||||
.withPassword("wmsa")
|
||||
.withInitScript("sql/current/10-domain-type.sql")
|
||||
.withNetworkAliases("mariadb");
|
||||
|
||||
static HikariDataSource dataSource;
|
||||
static DomainTypes domainTypes;
|
||||
|
||||
@BeforeAll
|
||||
public static void setup() {
|
||||
HikariConfig config = new HikariConfig();
|
||||
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
|
||||
config.setUsername("wmsa");
|
||||
config.setPassword("wmsa");
|
||||
|
||||
dataSource = new HikariDataSource(config);
|
||||
|
||||
domainTypes = new DomainTypes(dataSource);
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void teardown() {
|
||||
dataSource.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void reloadDomainsList() throws SQLException, IOException {
|
||||
domainTypes.reloadDomainsList(DomainTypes.Type.TEST);
|
||||
|
||||
var downloadedDomains = new HashSet<>(domainTypes.getAllDomainsByType(DomainTypes.Type.TEST));
|
||||
|
||||
var expectedDomains = Set.of("www.marginalia.nu", "search.marginalia.nu",
|
||||
"encyclopedia.marginalia.nu", "memex.marginalia.nu");
|
||||
|
||||
assertEquals(4, downloadedDomains.size());
|
||||
assertEquals(Set.of(), Sets.symmetricDifference(expectedDomains, downloadedDomains));
|
||||
}
|
||||
|
||||
}
|
@ -6,7 +6,10 @@ public enum HtmlFeature {
|
||||
MEDIA( "special:media"),
|
||||
JS("special:scripts"),
|
||||
AFFILIATE_LINK( "special:affiliate"),
|
||||
TRACKING("special:tracking"),
|
||||
TRACKING_INNOCENT("special:tracking"),
|
||||
TRACKING_EVIL("special:tracking2"),
|
||||
|
||||
VIEWPORT("special:viewport"),
|
||||
|
||||
COOKIES("special:cookies"),
|
||||
CATEGORY_FOOD("category:food"),
|
||||
@ -15,8 +18,43 @@ public enum HtmlFeature {
|
||||
|
||||
GA_SPAM("special:gaspam"),
|
||||
|
||||
UNKNOWN("special:uncategorized")
|
||||
;
|
||||
/** For fingerprinting and ranking */
|
||||
OPENGRAPH("special:opengraph"),
|
||||
OPENGRAPH_IMAGE("special:opengraph:image"),
|
||||
TWITTERCARD("special:twittercard"),
|
||||
TWITTERCARD_IMAGE("special:twittercard:image"),
|
||||
FONTAWSESOME("special:fontawesome"),
|
||||
GOOGLEFONTS("special:googlefonts"),
|
||||
DNS_PREFETCH("special:dnsprefetch"),
|
||||
PRELOAD("special:preload"),
|
||||
PRECONNECT("special:preconnect"),
|
||||
PINGBACK("special:pingback"),
|
||||
FEED("special:feed"),
|
||||
WEBMENTION("special:webmention"),
|
||||
INDIEAUTH("special:indieauth"),
|
||||
ME_TAG("special:metag"),
|
||||
NEXT_TAG("special:nexttag"),
|
||||
AMPHTML("special:amphtml"),
|
||||
JSON_LD("special:jsonld"),
|
||||
ORIGIN_TRIAL("special:origintrial"),
|
||||
PROFILE_GMPG("special:profile-gpmg"),
|
||||
QUANTCAST("special:quantcast"),
|
||||
COOKIELAW("special:cookielaw"),
|
||||
DIDOMI("special:didomi"),
|
||||
PARDOT("special:pardot"),
|
||||
ONESIGNAL("special:onesignal"),
|
||||
DATE_TAG("special:date_tag"),
|
||||
NOSCRIPT_TAG("special:noscript_tag"),
|
||||
|
||||
ROBOTS_INDEX("robots:index"),
|
||||
ROBOTS_FOLLOW("robots:follow"),
|
||||
ROBOTS_NOODP("robots:noodp"),
|
||||
ROBOTS_NOYDIR("robots:noydir"),
|
||||
DOFOLLOW_LINK("special:dofollow"),
|
||||
APPLE_TOUCH_ICON("special:appleicon"),
|
||||
|
||||
UNKNOWN("special:uncategorized");
|
||||
|
||||
|
||||
private final String keyword;
|
||||
|
||||
|
@ -17,6 +17,15 @@ public class DocumentLanguageData {
|
||||
public final DocumentSentence[] titleSentences;
|
||||
public final TObjectIntHashMap<String> wordCount;
|
||||
|
||||
/** for test convenience */
|
||||
public static DocumentLanguageData empty() {
|
||||
return new DocumentLanguageData(
|
||||
new DocumentSentence[0],
|
||||
new DocumentSentence[0],
|
||||
new TObjectIntHashMap<>()
|
||||
);
|
||||
}
|
||||
|
||||
public int totalNumWords() {
|
||||
int ret = 0;
|
||||
for (int i = 0; i < sentences.length; i++) {
|
||||
|
@ -29,6 +29,7 @@ dependencies {
|
||||
implementation project(':code:api:index-api')
|
||||
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service-discovery')
|
||||
|
@ -5,6 +5,7 @@ import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import plan.CrawlPlanLoader;
|
||||
import plan.CrawlPlan;
|
||||
import nu.marginalia.converting.compiler.InstructionsCompiler;
|
||||
@ -33,7 +34,8 @@ public class ConverterMain {
|
||||
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
||||
|
||||
Injector injector = Guice.createInjector(
|
||||
new ConverterModule(plan)
|
||||
new ConverterModule(plan),
|
||||
new DatabaseModule()
|
||||
);
|
||||
|
||||
injector.getInstance(ConverterMain.class);
|
||||
|
@ -0,0 +1,53 @@
|
||||
package nu.marginalia.converting.processor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.db.DomainTypes;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/** Converter-side wrapper for of common:db's DomainTypes,
|
||||
* which is a list of domains of a known type (e.g. blog)
|
||||
*/
|
||||
@Singleton
|
||||
public class ConverterDomainTypes {
|
||||
private final Logger logger = LoggerFactory.getLogger(ConverterDomainTypes.class);
|
||||
private final Map<EdgeDomain, DomainType> domainTypes = new HashMap<>();
|
||||
|
||||
private enum DomainType {
|
||||
BLOG
|
||||
}
|
||||
|
||||
@Inject
|
||||
public ConverterDomainTypes(DomainTypes types) throws SQLException {
|
||||
var allBlogs = types.getAllDomainsByType(DomainTypes.Type.BLOG);
|
||||
|
||||
if (allBlogs.isEmpty()) {
|
||||
logger.info("No domains of type BLOG found in database, downloading list");
|
||||
try {
|
||||
types.reloadDomainsList(DomainTypes.Type.BLOG);
|
||||
allBlogs = types.getAllDomainsByType(DomainTypes.Type.BLOG);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to download domains list", ex);
|
||||
}
|
||||
}
|
||||
|
||||
for (var item : allBlogs) {
|
||||
domainTypes.put(new EdgeDomain(item), DomainType.BLOG);
|
||||
}
|
||||
|
||||
logger.info("Loaded {} domain types", domainTypes.size());
|
||||
|
||||
}
|
||||
|
||||
public boolean isBlog(EdgeDomain domain) {
|
||||
return domainTypes.get(domain) == DomainType.BLOG;
|
||||
}
|
||||
}
|
@ -12,13 +12,13 @@ import java.util.List;
|
||||
public class DocumentGeneratorExtractor {
|
||||
private static final String defaultValue = "unset";
|
||||
|
||||
public DocumentGenerator generatorCleaned(Document doc) {
|
||||
public DocumentGenerator detectGenerator(Document doc, String responseHeaders) {
|
||||
|
||||
var tags = doc.select("meta[name=generator]");
|
||||
|
||||
if (tags.size() == 0) {
|
||||
// Some sites have a comment in the head instead of a meta tag
|
||||
return fingerprintByComments(doc);
|
||||
return fingerprintServerTech(doc, responseHeaders);
|
||||
}
|
||||
if (tags.size() > 1) {
|
||||
return DocumentGenerator.multiple();
|
||||
@ -29,11 +29,14 @@ public class DocumentGeneratorExtractor {
|
||||
generator = removePrefixOrSuffix(generator);
|
||||
|
||||
if (generator.isBlank())
|
||||
return DocumentGenerator.unset();
|
||||
return fingerprintServerTech(doc, responseHeaders);
|
||||
|
||||
if (generator.startsWith("AMP by WP"))
|
||||
return DocumentGenerator.of("wordpress", "wordpress-amp");
|
||||
|
||||
String[] parts = StringUtils.split(generator, " ,:!");
|
||||
if (parts.length == 0)
|
||||
return DocumentGenerator.unset();
|
||||
return fingerprintServerTech(doc, responseHeaders);
|
||||
|
||||
int slashIdx = parts[0].indexOf('/');
|
||||
if (slashIdx >= 0) {
|
||||
@ -42,7 +45,7 @@ public class DocumentGeneratorExtractor {
|
||||
}
|
||||
|
||||
if (parts.length > 3) {
|
||||
return DocumentGenerator.unset(); // if it's still very long after trim(), it's probably a custom hand written message
|
||||
return fingerprintServerTech(doc, responseHeaders); // if it's still very long after trim(), it's probably a custom hand written message
|
||||
}
|
||||
|
||||
switch (parts[0]) {
|
||||
@ -73,7 +76,7 @@ public class DocumentGeneratorExtractor {
|
||||
}
|
||||
|
||||
// Fallback logic when there is no meta tag
|
||||
private DocumentGenerator fingerprintByComments(Document doc) {
|
||||
private DocumentGenerator fingerprintServerTech(Document doc, String responseHeaders) {
|
||||
|
||||
for (var comment : doc.getElementsByTag("head").comments()) {
|
||||
String data = comment.getData();
|
||||
@ -81,22 +84,43 @@ public class DocumentGeneratorExtractor {
|
||||
if (data.contains("Generated by javadoc")) {
|
||||
return DocumentGenerator.of("javadoc");
|
||||
}
|
||||
|
||||
if (data.contains("Squarespace")) {
|
||||
return DocumentGenerator.of("squarespace");
|
||||
}
|
||||
if (data.contains("phpBB")) {
|
||||
return DocumentGenerator.of("phpbb");
|
||||
}
|
||||
}
|
||||
|
||||
for (var tag : doc.head().getElementsByTag("script")) {
|
||||
if (tag.html().contains("window.lemmyConfig")) {
|
||||
return DocumentGenerator.of("lemmy");
|
||||
}
|
||||
if (tag.html().contains("URL_DOMAIN = 'wikidot.com'")) {
|
||||
return DocumentGenerator.of("wikidot");
|
||||
}
|
||||
if (tag.attr("src").contains("wp-content")) {
|
||||
String scriptSrc = tag.attr("src");
|
||||
|
||||
if (scriptSrc.contains("wp-content") || scriptSrc.contains("wp-includes")) {
|
||||
return DocumentGenerator.of("wordpress", "wordpress-sneaky");
|
||||
}
|
||||
if (scriptSrc.contains("squarespace.com")) {
|
||||
return DocumentGenerator.of("squarespace");
|
||||
}
|
||||
if (scriptSrc.contains("cdn.cloversites.com")) {
|
||||
return DocumentGenerator.of("cloversites");
|
||||
}
|
||||
if (scriptSrc.contains("bndzgl.com")) {
|
||||
return DocumentGenerator.of("bndzgl");
|
||||
}
|
||||
if (scriptSrc.contains("editmysite.com")) {
|
||||
return DocumentGenerator.of("editmysite");
|
||||
}
|
||||
if (scriptSrc.contains("website-editor.net")) {
|
||||
return DocumentGenerator.of("website-editor.net");
|
||||
}
|
||||
String scriptHtml = tag.html();
|
||||
if (scriptHtml.contains("window.lemmyConfig")) {
|
||||
return DocumentGenerator.of("lemmy");
|
||||
}
|
||||
if (scriptHtml.contains("URL_DOMAIN = 'wikidot.com'")) {
|
||||
return DocumentGenerator.of("wikidot");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
for (var tag : doc.head().getElementsByTag("link")) {
|
||||
@ -109,6 +133,10 @@ public class DocumentGeneratorExtractor {
|
||||
return DocumentGenerator.of("flarum");
|
||||
}
|
||||
|
||||
if (doc.getElementById("tracpowered") != null) {
|
||||
return DocumentGenerator.of("trac");
|
||||
}
|
||||
|
||||
if (doc.getElementById("_xfClientLoadTime") != null) {
|
||||
return DocumentGenerator.of("xenforo");
|
||||
}
|
||||
@ -117,6 +145,48 @@ public class DocumentGeneratorExtractor {
|
||||
return DocumentGenerator.of("invision");
|
||||
}
|
||||
|
||||
if (doc.getElementById("___gatsby") != null) {
|
||||
return DocumentGenerator.of("gatsby");
|
||||
}
|
||||
|
||||
String[] headers = responseHeaders.toLowerCase().split("\n+");
|
||||
for (var header : headers) {
|
||||
if (header.contains("x-drupal-cache")) {
|
||||
return DocumentGenerator.of("drupal");
|
||||
}
|
||||
if (header.contains("x-powered-by: asp.net")) {
|
||||
return DocumentGenerator.of("asp.net");
|
||||
}
|
||||
if (header.contains("x-powered-by: php")) {
|
||||
return DocumentGenerator.of("php");
|
||||
}
|
||||
if (header.contains("x-powered-by: wp engine")) {
|
||||
return DocumentGenerator.of("wordpress", "wp-engine", "wordpress-sneaky");
|
||||
}
|
||||
if (header.contains("x-powered-by: statamic")) {
|
||||
return DocumentGenerator.of("laravel", "statamic");
|
||||
}
|
||||
}
|
||||
|
||||
// These should be all the way down as they are the most generic
|
||||
for (var header : headers) {
|
||||
if (header.contains("server: mastodon")) {
|
||||
return DocumentGenerator.of("mastodon");
|
||||
}
|
||||
if (header.contains("server: gunicorn")) {
|
||||
return DocumentGenerator.of("gunicorn");
|
||||
}
|
||||
if (header.contains("server: nginx")) {
|
||||
return DocumentGenerator.of("nginx");
|
||||
}
|
||||
if (header.contains("server: apache")) {
|
||||
return DocumentGenerator.of("apache");
|
||||
}
|
||||
if (header.contains("server: cowboy")) {
|
||||
return DocumentGenerator.of("cowboy"); // erlang, really?!
|
||||
}
|
||||
}
|
||||
|
||||
return DocumentGenerator.unset();
|
||||
}
|
||||
|
||||
@ -138,6 +208,11 @@ public class DocumentGeneratorExtractor {
|
||||
generator = generator.substring(0, dashIdx);
|
||||
}
|
||||
|
||||
int parenIdx = generator.indexOf('('); // Some strings have values like 'Drupal 9 (https://www.drupal.org)'
|
||||
if (parenIdx >= 0) {
|
||||
generator = generator.substring(0, parenIdx);
|
||||
}
|
||||
|
||||
if (!StringUtils.isAsciiPrintable(generator))
|
||||
return "";
|
||||
|
||||
@ -170,11 +245,18 @@ public class DocumentGeneratorExtractor {
|
||||
final GeneratorType type = switch (parts[0]) {
|
||||
case "joomla", "wordpress", "drupal", "plone", "postnuke", "divi", "freeway", "unicity",
|
||||
"modx", "sitemagic", "agility", "edlio", "blogger", "slider", "slider_revolution", "gravcms",
|
||||
"typo3", "dotnetnuke", "cms", "coremedia", "dspace"
|
||||
"typo3", "dotnetnuke", "cms", "coremedia", "dspace", "laravel", "trac", "bunnypress", "astro",
|
||||
"ghost", "publii"
|
||||
-> GeneratorType.CMS;
|
||||
case "wix.com", "one.com", "wpbakery", "claris", "wordpress.com", "hubspot",
|
||||
"visual_composer", "mobirise", "everweb", "rapidweaver", "shorthand",
|
||||
"visual", "nitropack",
|
||||
"visual", "nitropack", "squarespace", "editmysite", "websiteeditor.net",
|
||||
|
||||
"svbtle.com", "write.as", "montaigne.io", // blogging platforms, maybe should be in another category?
|
||||
|
||||
"cloversites", // clover is a church-oriented website builder, found that kinda neat
|
||||
"bndzgl", // band websites ..?
|
||||
|
||||
/* these are not SAAS but close enough */
|
||||
"redux", "bootply"
|
||||
-> GeneratorType.SAAS;
|
||||
@ -185,7 +267,8 @@ public class DocumentGeneratorExtractor {
|
||||
"pdf2htmlex", "nvu", "mozilla", "golive", "tenfingers", "publisher",
|
||||
"allaire", "neooffice"
|
||||
-> GeneratorType.BOOMER_STATIC;
|
||||
case "hugo", "jekyll", "hakyll", "gatsby", "react", "gridsome"
|
||||
case "hugo", "jekyll", "hakyll", "nikola", "zola", "olivetti", "pelican", "sushy", "hexo", "eleventy",
|
||||
"gridsome", "vuepress", "docusaurus", "docpad", "techou", "quarto", "soupault"
|
||||
-> GeneratorType.ZOOMER_STATIC;
|
||||
case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano",
|
||||
"notepad.exe", "gedit", "me",
|
||||
@ -198,9 +281,9 @@ public class DocumentGeneratorExtractor {
|
||||
-> GeneratorType.FORUM;
|
||||
case "mediawiki", "dokuwiki", "wikidot", "sharepoint"
|
||||
-> GeneratorType.WIKI;
|
||||
case "pandoc", "mkdocs", "doxygen", "javadoc"
|
||||
case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc"
|
||||
-> GeneratorType.DOCS;
|
||||
case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic"
|
||||
case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic", "osclass"
|
||||
-> GeneratorType.ECOMMERCE_AND_SPAM;
|
||||
default
|
||||
-> GeneratorType.UNKNOWN;
|
||||
@ -216,7 +299,7 @@ public class DocumentGeneratorExtractor {
|
||||
public static DocumentGenerator multiple() {
|
||||
// It's *generally* WordPress or the like that injects multiple generator tags
|
||||
|
||||
return new DocumentGenerator(GeneratorType.CMS, List.of(defaultValue));
|
||||
return new DocumentGenerator(GeneratorType.CMS, List.of("wordpress", "wp-best-guess"));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -19,14 +19,13 @@ public class DocumentValuator {
|
||||
int textLength) throws DisqualifiedException {
|
||||
double scriptPenalty = getScriptPenalty(parsedDocument);
|
||||
|
||||
int textBodyLength = textLength;
|
||||
int rawLength = crawledDocument.documentBody.length();
|
||||
|
||||
if (textBodyLength == 0) {
|
||||
if (textLength == 0) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
|
||||
}
|
||||
|
||||
return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale
|
||||
return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale
|
||||
+ htmlStandard.offset
|
||||
- scriptPenalty;
|
||||
}
|
||||
|
@ -21,25 +21,29 @@ import java.util.Set;
|
||||
@Singleton
|
||||
public class FeatureExtractor {
|
||||
|
||||
private static final List<String> trackers = List.of("adform.net",
|
||||
private static final List<String> innocentTrackers = List.of(
|
||||
"twitter.com",
|
||||
"bing.com",
|
||||
"msn.com");
|
||||
private static final List<String> shittyTrackers = List.of("adform.net",
|
||||
"connect.facebook",
|
||||
"facebook.com/tr",
|
||||
"googletagmanager.com",
|
||||
"googlesyndication.com",
|
||||
"google.com",
|
||||
"twitter.com",
|
||||
"smartadserver.com",
|
||||
"doubleclick.com",
|
||||
"2mdn.com",
|
||||
"dmtry.com",
|
||||
"bing.com",
|
||||
"msn.com",
|
||||
"amazon-adsystem.com",
|
||||
"alexametrics.com",
|
||||
"rubiconproject.com",
|
||||
"chango.com",
|
||||
"d5nxst8fruw4z.cloudfront.net",
|
||||
"d31qbv1cthcecs.cloudfront.net",
|
||||
"linkedin.com");
|
||||
"linkedin.com",
|
||||
"perfectaudience.com",
|
||||
"marketingautomation.services",
|
||||
"usefathom");
|
||||
|
||||
private final AdblockSimulator adblockSimulator;
|
||||
private final RecipeDetector recipeDetector;
|
||||
@ -71,21 +75,119 @@ public class FeatureExtractor {
|
||||
}
|
||||
|
||||
for (var scriptTag : scriptTags) {
|
||||
if (isJavascriptTag(scriptTag)) {
|
||||
final String type = scriptTag.attr("type");
|
||||
|
||||
if ("application/ld+json".equalsIgnoreCase(type)) {
|
||||
features.add(HtmlFeature.JSON_LD);
|
||||
}
|
||||
else {
|
||||
features.add(HtmlFeature.JS);
|
||||
}
|
||||
}
|
||||
|
||||
if (!doc.head().getElementsByTag("viewport").isEmpty()) {
|
||||
features.add(HtmlFeature.VIEWPORT);
|
||||
}
|
||||
for (var atag : doc.body().getElementsByTag("a")) {
|
||||
var rel = atag.attr("rel");
|
||||
if (rel.equals("dofollow")) {
|
||||
features.add(HtmlFeature.DOFOLLOW_LINK);
|
||||
}
|
||||
}
|
||||
|
||||
if (!doc.getElementsByTag("date").isEmpty()) {
|
||||
features.add(HtmlFeature.DATE_TAG);
|
||||
}
|
||||
if (!doc.getElementsByTag("noscript").isEmpty()) {
|
||||
features.add(HtmlFeature.NOSCRIPT_TAG);
|
||||
}
|
||||
|
||||
|
||||
for (var link : doc.head().getElementsByTag("link")) {
|
||||
|
||||
// 500 IQ web developers use <link> error or load handlers
|
||||
// sneakily load JS without explicit script tags
|
||||
for (var link : doc.head().getElementsByTag("link")) {
|
||||
if (link.hasAttr("onerror")) {
|
||||
if (link.hasAttr("onerror"))
|
||||
features.add(HtmlFeature.JS);
|
||||
break;
|
||||
if (link.hasAttr("onload"))
|
||||
features.add(HtmlFeature.JS);
|
||||
|
||||
if (link.hasAttr("pingback")) {
|
||||
features.add(HtmlFeature.PINGBACK);
|
||||
}
|
||||
if (link.hasAttr("onload")) {
|
||||
features.add(HtmlFeature.JS);
|
||||
break;
|
||||
|
||||
|
||||
var href = link.attr("href");
|
||||
|
||||
if (href.contains("indieauth"))
|
||||
features.add(HtmlFeature.INDIEAUTH);
|
||||
|
||||
var rel = link.attr("rel");
|
||||
|
||||
if (rel.equals("webmention"))
|
||||
features.add(HtmlFeature.WEBMENTION);
|
||||
|
||||
if (rel.equals("me"))
|
||||
features.add(HtmlFeature.ME_TAG);
|
||||
|
||||
if (rel.equals("next"))
|
||||
features.add(HtmlFeature.NEXT_TAG);
|
||||
|
||||
if (rel.equals("alternate") && link.hasAttr("type"))
|
||||
features.add(HtmlFeature.FEED);
|
||||
|
||||
if (rel.equals("dns-prefetch"))
|
||||
features.add(HtmlFeature.DNS_PREFETCH);
|
||||
|
||||
if (rel.equals("preload"))
|
||||
features.add(HtmlFeature.PRELOAD);
|
||||
|
||||
if (rel.equals("preconnect"))
|
||||
features.add(HtmlFeature.PRECONNECT);
|
||||
|
||||
if (rel.equals("amphtml"))
|
||||
features.add(HtmlFeature.AMPHTML);
|
||||
|
||||
if (rel.equals("apple-touch-icon"))
|
||||
features.add(HtmlFeature.APPLE_TOUCH_ICON);
|
||||
|
||||
}
|
||||
|
||||
for (var meta : doc.head().getElementsByTag("meta")) {
|
||||
// <meta name="robots" content="index,follow">
|
||||
if (meta.attr("name").equals("robots")) {
|
||||
var content = meta.attr("content");
|
||||
if (!content.contains("noindex") && content.contains("index")) {
|
||||
features.add(HtmlFeature.ROBOTS_INDEX);
|
||||
}
|
||||
if (!content.contains("nofollow") && content.contains("follow")) {
|
||||
features.add(HtmlFeature.ROBOTS_FOLLOW);
|
||||
}
|
||||
if (content.contains("noodp")) {
|
||||
features.add(HtmlFeature.ROBOTS_NOODP);
|
||||
}
|
||||
if (content.contains("noydir")) {
|
||||
features.add(HtmlFeature.ROBOTS_NOYDIR);
|
||||
}
|
||||
}
|
||||
|
||||
if (meta.attr("profile").contains("gmpg")) {
|
||||
features.add(HtmlFeature.PROFILE_GMPG);
|
||||
}
|
||||
if (meta.attr("property").equals("og:description")) {
|
||||
features.add(HtmlFeature.OPENGRAPH);
|
||||
}
|
||||
if (meta.attr("property").equals("og:image")) {
|
||||
features.add(HtmlFeature.OPENGRAPH_IMAGE);
|
||||
}
|
||||
if (meta.attr("name").equals("twitter:description")) {
|
||||
features.add(HtmlFeature.TWITTERCARD);
|
||||
}
|
||||
if (meta.attr("name").equals("twitter:image")) {
|
||||
features.add(HtmlFeature.TWITTERCARD_IMAGE);
|
||||
}
|
||||
if (meta.attr("http-equiv").equals("origin-trial")) {
|
||||
features.add(HtmlFeature.ORIGIN_TRIAL);
|
||||
}
|
||||
}
|
||||
|
||||
@ -100,14 +202,74 @@ public class FeatureExtractor {
|
||||
}
|
||||
|
||||
for (var scriptTag : scriptTags) {
|
||||
if (hasTrackingScript(scriptTag)) {
|
||||
features.add(HtmlFeature.TRACKING);
|
||||
break;
|
||||
if (hasInvasiveTrackingScript(scriptTag)) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
features.add(HtmlFeature.TRACKING_EVIL);
|
||||
}
|
||||
else if (hasNaiveTrackingScript(scriptTag)) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
}
|
||||
|
||||
if (scriptTag.hasAttr("didomi/javascript")) {
|
||||
features.add(HtmlFeature.DIDOMI);
|
||||
}
|
||||
|
||||
String src = scriptTag.attr("src");
|
||||
if (src.contains("OneSignalSDK")) {
|
||||
features.add(HtmlFeature.ONESIGNAL);
|
||||
}
|
||||
|
||||
String scriptText = scriptTag.html();
|
||||
|
||||
if (scriptText.contains("'pd.js'")) {
|
||||
features.add(HtmlFeature.PARDOT);
|
||||
}
|
||||
if (scriptText.contains("https://cmp.quantcast.com")) {
|
||||
features.add(HtmlFeature.QUANTCAST);
|
||||
}
|
||||
if (scriptText.contains("https://quantcast.mgr.consensu.org")) {
|
||||
features.add(HtmlFeature.QUANTCAST);
|
||||
}
|
||||
if (scriptText.contains("https://cdn.cookielaw.org")) {
|
||||
features.add(HtmlFeature.COOKIELAW);
|
||||
}
|
||||
if (scriptText.contains("_linkedin_data_partner_id")) {
|
||||
features.add(HtmlFeature.TRACKING_EVIL);
|
||||
}
|
||||
if (scriptText.contains("window.OneSignal")) {
|
||||
features.add(HtmlFeature.ONESIGNAL);
|
||||
}
|
||||
if (scriptText.contains("connect.facebook.net")) {
|
||||
features.add(HtmlFeature.TRACKING_EVIL);
|
||||
}
|
||||
if (scriptText.contains("hotjar.com")) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
}
|
||||
}
|
||||
|
||||
for (var noscript : doc.getElementsByTag("noscript")) {
|
||||
for (var iframe : noscript.getElementsByTag("iframe")) {
|
||||
if (hasInvasiveTrackingScript(iframe)) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
features.add(HtmlFeature.TRACKING_EVIL);
|
||||
}
|
||||
else if (hasNaiveTrackingScript(iframe)) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
}
|
||||
}
|
||||
for (var img : noscript.getElementsByTag("img")) {
|
||||
if (hasInvasiveTrackingScript(img)) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
features.add(HtmlFeature.TRACKING_EVIL);
|
||||
}
|
||||
else if (hasNaiveTrackingScript(img)) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (scriptTags.html().contains("google-analytics.com")) {
|
||||
features.add(HtmlFeature.TRACKING);
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
}
|
||||
|
||||
for (var aTag : doc.getElementsByTag("a")) {
|
||||
@ -129,29 +291,32 @@ public class FeatureExtractor {
|
||||
return features;
|
||||
}
|
||||
|
||||
private boolean hasTrackingScript(Element scriptTag) {
|
||||
return hasTrackingScript(scriptTag.attr("src"));
|
||||
private boolean hasInvasiveTrackingScript(Element scriptTag) {
|
||||
return hasInvasiveTrackingScript(scriptTag.attr("src"));
|
||||
}
|
||||
private boolean hasNaiveTrackingScript(Element scriptTag) {
|
||||
return hasNaiveTrackingScript(scriptTag.attr("src"));
|
||||
}
|
||||
private boolean hasInvasiveTrackingScript(String src) {
|
||||
|
||||
private boolean hasTrackingScript(String scriptText) {
|
||||
|
||||
for (var tracker : trackers) {
|
||||
if (scriptText.contains(tracker)) {
|
||||
for (var tracker : shittyTrackers) {
|
||||
if (src.contains(tracker)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean isJavascriptTag(Element scriptTag) {
|
||||
final String type = scriptTag.attr("type");
|
||||
private boolean hasNaiveTrackingScript(String src) {
|
||||
|
||||
if ("application/ld+json".equalsIgnoreCase(type)) {
|
||||
for (var tracker : innocentTrackers) {
|
||||
if (src.contains(tracker)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
boolean isAmazonAffiliateLink(Element aTag) {
|
||||
final String href = aTag.attr("href").toLowerCase();
|
||||
|
@ -111,9 +111,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
|
||||
|
||||
final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
|
||||
final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, crawledDocument.headers);
|
||||
|
||||
final var specialization = htmlProcessorSpecializations.select(generatorParts);
|
||||
final var specialization = htmlProcessorSpecializations.select(generatorParts, url);
|
||||
|
||||
if (!specialization.shouldIndex(url)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
|
||||
@ -167,7 +167,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
.addGenerator(generatorParts.keywords())
|
||||
.build();
|
||||
|
||||
|
||||
words.addAllSyntheticTerms(tagWords);
|
||||
specialization.amendWords(doc, words);
|
||||
|
||||
getLinks(url, ret, doc, words);
|
||||
|
||||
@ -216,8 +218,23 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
return true;
|
||||
}
|
||||
|
||||
// Annoying wordpress crap
|
||||
if (url.path.startsWith("/tag/") && url.path.endsWith("/")) {
|
||||
// Annoying blog crap
|
||||
if (url.path.contains("/tag/") && url.path.endsWith("/")) {
|
||||
return true;
|
||||
}
|
||||
if (url.path.contains("/tags/") && url.path.endsWith("/")) {
|
||||
return true;
|
||||
}
|
||||
if (url.path.contains("/category/") && url.path.endsWith("/")) {
|
||||
return true;
|
||||
}
|
||||
if (url.path.contains("/categories/") && url.path.endsWith("/")) {
|
||||
return true;
|
||||
}
|
||||
if (url.path.contains("/section/") && url.path.endsWith("/")) {
|
||||
return true;
|
||||
}
|
||||
if (url.path.contains("/sections/") && url.path.endsWith("/")) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
@ -0,0 +1,210 @@
|
||||
package nu.marginalia.converting.processor.plugin.specialization;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.summary.SummaryExtractor;
|
||||
import org.apache.logging.log4j.util.Strings;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.nodes.Node;
|
||||
import org.jsoup.select.NodeFilter;
|
||||
import org.jsoup.select.NodeVisitor;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/** The blog specialization is used for blogs, and makes heavy assumptions about the nature of the document
|
||||
* that aren't generally true, but if the categorization is correct, will yield much better results.
|
||||
*/
|
||||
@Singleton
|
||||
public class BlogSpecialization extends DefaultSpecialization {
|
||||
|
||||
@Inject
|
||||
public BlogSpecialization(SummaryExtractor summaryExtractor) {
|
||||
super(summaryExtractor);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Document prune(Document original) {
|
||||
var doc = original.clone();
|
||||
|
||||
// Remove all nav junk, comments and other stuff
|
||||
doc.filter(new BlogPruningFilter());
|
||||
|
||||
// If there is an article tag, use that as the root
|
||||
var articleTags = doc.getElementsByTag("article");
|
||||
var firstArticle = articleTags.first();
|
||||
if (firstArticle != null) {
|
||||
var art = firstArticle.clone();
|
||||
|
||||
doc.body().empty();
|
||||
doc.body().appendChild(art);
|
||||
|
||||
return doc;
|
||||
}
|
||||
|
||||
// Use the default pruning as a fallback
|
||||
return super.prune(doc);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSummary(Document original, Set<String> importantWords) {
|
||||
return super.getSummary(original, importantWords);
|
||||
}
|
||||
|
||||
private final static List<String> badPathElements =
|
||||
List.of("/tag/", "/tags/", "/tagged/", "/category/", "/categories/", "/section/", "/sections/", "/page/", "/author/");
|
||||
|
||||
private final static Predicate<String> dateIndexTest1 = Pattern.compile("^/(\\d{4}/(\\d{2}/){0,2}?)$").asMatchPredicate();
|
||||
private final static Predicate<String> dateIndexTest2 = Pattern.compile("^/(\\d{2}/){1,2}$").asMatchPredicate();
|
||||
|
||||
@Override
|
||||
public boolean shouldIndex(EdgeUrl url) {
|
||||
String path = url.path;
|
||||
|
||||
// Don't index the root path for blogs, as it is usually an ephemeral list of all posts
|
||||
if ("/".equals(path)) return false;
|
||||
|
||||
// Likewise for the blog's home page
|
||||
if (path.endsWith("/blog/")) return false;
|
||||
if (path.endsWith("/log/")) return false;
|
||||
if (path.endsWith("/weblog/")) return false;
|
||||
if (path.endsWith("/posts/")) return false;
|
||||
if (path.endsWith("/articles/")) return false;
|
||||
|
||||
// Refuse paths that contain any of the bad path elements
|
||||
for (String badPathElement : badPathElements) {
|
||||
if (path.contains(badPathElement)) return false;
|
||||
}
|
||||
|
||||
// We don't want chronological listings
|
||||
if (dateIndexTest1.test(path)) return false;
|
||||
if (dateIndexTest2.test(path)) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static PorterStemmer ps = new PorterStemmer();
|
||||
public void amendWords(Document doc, DocumentKeywordsBuilder words) {
|
||||
var tagExtractor = new BlogTagExtractor();
|
||||
doc.traverse(tagExtractor);
|
||||
|
||||
var tags = tagExtractor.getTags();
|
||||
if (!tags.isEmpty()) {
|
||||
var stemmed = tags.stream().map(ps::stemWord).collect(Collectors.toSet());
|
||||
words.setFlagOnMetadataForWords(WordFlags.Subjects, stemmed);
|
||||
|
||||
Set<String> specialTags = tags.stream().map(s -> "tag:" + s).collect(Collectors.toSet());
|
||||
words.addAllSyntheticTerms(specialTags);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** Removes all the non-content elements from the document,
|
||||
* making strong blog-specific assumptions about the nature of
|
||||
* the layout */
|
||||
private static class BlogPruningFilter implements NodeFilter {
|
||||
private static final List<String> badClassElements = Arrays.asList("comment", "reply", "sidebar", "header", "footer", "nav");
|
||||
private static final List<String> badIdElements = Arrays.asList("comments", "header", "footer", "nav");
|
||||
|
||||
@Override
|
||||
public FilterResult head(Node node, int depth) {
|
||||
if (node instanceof Element el) {
|
||||
String classes = el.attr("class");
|
||||
String id = el.id();
|
||||
|
||||
for (String badClassElement : badClassElements) {
|
||||
if (classes.contains(badClassElement)) {
|
||||
return FilterResult.REMOVE;
|
||||
}
|
||||
}
|
||||
for (String badIdElement : badIdElements) {
|
||||
if (id.contains(badIdElement)) {
|
||||
return FilterResult.REMOVE;
|
||||
}
|
||||
}
|
||||
}
|
||||
return FilterResult.CONTINUE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Extract tag keywords from the blog post
|
||||
public static class BlogTagExtractor implements NodeVisitor {
|
||||
private final Set<String> tags = new HashSet<>();
|
||||
int lookForTags = -1;
|
||||
|
||||
public Set<String> getTags() {
|
||||
Set<String> tagsClean = tags.stream().map(String::toLowerCase).map(this::cleanTag).filter(Strings::isNotBlank).collect(Collectors.toSet());
|
||||
|
||||
// If there are more than 5 tags, it's probably a global tag listing
|
||||
// and not a post-specific tag listing
|
||||
if (tagsClean.size() > 5)
|
||||
return Set.of();
|
||||
|
||||
return tagsClean;
|
||||
}
|
||||
|
||||
private final Pattern splitterPattern = Pattern.compile("\\s+");
|
||||
private final Pattern noisePattern = Pattern.compile("[^a-zA-Z0-9]");
|
||||
|
||||
// This is hideously expensive but blog posts are relatively few and far between
|
||||
private String cleanTag(String tag) {
|
||||
|
||||
String[] parts = splitterPattern.split(tag);
|
||||
|
||||
if (parts.length > 3)
|
||||
return "";
|
||||
|
||||
for (int i = 0; i < parts.length; i++) {
|
||||
if (parts[i].startsWith("#"))
|
||||
parts[i] = parts[i].substring(1);
|
||||
else if (parts[i].startsWith("(") && parts[i].endsWith(")"))
|
||||
parts[i] = "";
|
||||
else
|
||||
parts[i] = noisePattern.matcher(parts[i]).replaceAll("");
|
||||
|
||||
if (parts[i].equals("tags"))
|
||||
parts[i] = "";
|
||||
}
|
||||
|
||||
|
||||
return Arrays.stream(parts).filter(Strings::isNotBlank).collect(Collectors.joining("_"));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void head(Node node, int depth) {
|
||||
|
||||
if (!(node instanceof Element el)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (lookForTags < 0) {
|
||||
if (el.attr("class").contains("tags")) {
|
||||
lookForTags = depth;
|
||||
}
|
||||
if (el.tagName().equals("a")) {
|
||||
if (el.attr("class").contains("tag")
|
||||
|| el.attr("href").startsWith("/tag/"))
|
||||
tags.add(el.text());
|
||||
}
|
||||
}
|
||||
else if (el.tagName().equals("a")) {
|
||||
tags.add(el.text());
|
||||
}
|
||||
|
||||
}
|
||||
public void tail(Node node, int depth) {
|
||||
if (depth <= lookForTags) { lookForTags = -1; }
|
||||
}
|
||||
}
|
||||
}
|
@ -2,7 +2,9 @@ package nu.marginalia.converting.processor.plugin.specialization;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.converting.processor.ConverterDomainTypes;
|
||||
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
@ -10,27 +12,41 @@ import java.util.Set;
|
||||
|
||||
@Singleton
|
||||
public class HtmlProcessorSpecializations {
|
||||
private final ConverterDomainTypes domainTypes;
|
||||
private final LemmySpecialization lemmySpecialization;
|
||||
private final XenForoSpecialization xenforoSpecialization;
|
||||
private final PhpBBSpecialization phpBBSpecialization;
|
||||
private final JavadocSpecialization javadocSpecialization;
|
||||
private final BlogSpecialization blogSpecialization;
|
||||
private final DefaultSpecialization defaultSpecialization;
|
||||
|
||||
@Inject
|
||||
public HtmlProcessorSpecializations(LemmySpecialization lemmySpecialization,
|
||||
public HtmlProcessorSpecializations(ConverterDomainTypes domainTypes,
|
||||
LemmySpecialization lemmySpecialization,
|
||||
XenForoSpecialization xenforoSpecialization,
|
||||
PhpBBSpecialization phpBBSpecialization,
|
||||
JavadocSpecialization javadocSpecialization,
|
||||
BlogSpecialization blogSpecialization,
|
||||
DefaultSpecialization defaultSpecialization) {
|
||||
this.domainTypes = domainTypes;
|
||||
this.lemmySpecialization = lemmySpecialization;
|
||||
this.xenforoSpecialization = xenforoSpecialization;
|
||||
this.phpBBSpecialization = phpBBSpecialization;
|
||||
this.javadocSpecialization = javadocSpecialization;
|
||||
this.blogSpecialization = blogSpecialization;
|
||||
this.defaultSpecialization = defaultSpecialization;
|
||||
}
|
||||
|
||||
/** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */
|
||||
public HtmlProcessorSpecializationIf select(DocumentGeneratorExtractor.DocumentGenerator generator) {
|
||||
public HtmlProcessorSpecializationIf select(
|
||||
DocumentGeneratorExtractor.DocumentGenerator generator,
|
||||
EdgeUrl url)
|
||||
{
|
||||
|
||||
if (domainTypes.isBlog(url.domain)) {
|
||||
return blogSpecialization;
|
||||
}
|
||||
|
||||
if (generator.keywords().contains("lemmy")) {
|
||||
return lemmySpecialization;
|
||||
}
|
||||
@ -58,5 +74,8 @@ public class HtmlProcessorSpecializations {
|
||||
|
||||
default boolean shouldIndex(EdgeUrl url) { return true; }
|
||||
default double lengthModifier() { return 1.0; }
|
||||
|
||||
default void amendWords(Document doc, DocumentKeywordsBuilder words) {}
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -4,6 +4,8 @@ import com.google.inject.AbstractModule;
|
||||
import com.google.inject.name.Names;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.ConverterDomainTypes;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
public class ConvertingIntegrationTestModule extends AbstractModule {
|
||||
public void configure() {
|
||||
@ -13,5 +15,6 @@ public class ConvertingIntegrationTestModule extends AbstractModule {
|
||||
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
|
||||
|
||||
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||
bind(ConverterDomainTypes.class).toInstance(Mockito.mock(ConverterDomainTypes.class));
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,17 @@
|
||||
package nu.marginalia.converting.processor.plugin.specialization;
|
||||
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class BlogSpecializationTest {
|
||||
|
||||
@Test
|
||||
void shouldIndex() throws Exception {
|
||||
var spec = new BlogSpecialization(null);
|
||||
assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/2023/00/22/")));
|
||||
assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/2023/00/")));
|
||||
assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/00/22/")));
|
||||
}
|
||||
}
|
@ -34,7 +34,7 @@ class JavadocSpecializationTest {
|
||||
|
||||
@Test
|
||||
void generatorExtraction() {
|
||||
var gen = generatorExtractor.generatorCleaned(Jsoup.parse(thread));
|
||||
var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), "");
|
||||
|
||||
System.out.println(gen);
|
||||
}
|
||||
|
@ -37,8 +37,8 @@ class LemmySpecializationTest {
|
||||
|
||||
@Test
|
||||
void generatorExtraction() {
|
||||
var generatorIndex = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyIndexHtml));
|
||||
var generatorPost = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyPost));
|
||||
var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), "");
|
||||
var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), "");
|
||||
|
||||
System.out.println(generatorIndex);
|
||||
System.out.println(generatorPost);
|
||||
|
@ -34,7 +34,7 @@ class XenForoSpecializationTest {
|
||||
|
||||
@Test
|
||||
void generatorExtraction() {
|
||||
var gen = generatorExtractor.generatorCleaned(Jsoup.parse(thread));
|
||||
var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), "");
|
||||
|
||||
System.out.println(gen);
|
||||
}
|
||||
|
@ -20,7 +20,8 @@ public class LinkFilterSelector {
|
||||
}
|
||||
|
||||
if (isLemmy(head)) {
|
||||
return url -> url.path.startsWith("/post/") || url.path.startsWith("/c/");
|
||||
return url -> url.path.startsWith("/post/")
|
||||
|| (url.path.startsWith("/c/") && !url.path.contains("@"));
|
||||
}
|
||||
if (isDiscourse(head)) {
|
||||
return url -> url.path.startsWith("/t/") || url.path.contains("/latest");
|
||||
|
@ -211,7 +211,7 @@ public class IndexQueryService {
|
||||
return switch (priority) {
|
||||
case BEST -> false;
|
||||
case GOOD -> resultCount > params.fetchSize / 4;
|
||||
case FALLBACK -> resultCount > params.fetchSize / 256;
|
||||
case FALLBACK -> resultCount > params.fetchSize / 8;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -121,7 +121,7 @@ public class UrlDetails {
|
||||
|
||||
for (var problem :EnumSet.of(
|
||||
HtmlFeature.JS,
|
||||
HtmlFeature.TRACKING,
|
||||
HtmlFeature.TRACKING_INNOCENT,
|
||||
HtmlFeature.AFFILIATE_LINK,
|
||||
HtmlFeature.COOKIES,
|
||||
HtmlFeature.ADVERTISEMENT)) {
|
||||
@ -156,7 +156,7 @@ public class UrlDetails {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.JS);
|
||||
}
|
||||
public boolean isTracking() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING);
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_INNOCENT);
|
||||
}
|
||||
public boolean isAffiliate() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK);
|
||||
|
@ -1,18 +1,12 @@
|
||||
package nu.marginalia.tools.experiments;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
|
||||
import nu.marginalia.converting.processor.plugin.specialization.BlogSpecialization;
|
||||
import nu.marginalia.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.tools.Experiment;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
public class DebugConverterExperiment extends Experiment {
|
||||
|
||||
|
||||
@ -24,56 +18,25 @@ public class DebugConverterExperiment extends Experiment {
|
||||
|
||||
}
|
||||
|
||||
Set<String> seenGenerators = new HashSet<>();
|
||||
|
||||
@Override
|
||||
public boolean process(CrawledDomain domain) {
|
||||
|
||||
if (domain.doc == null) return true;
|
||||
|
||||
var dge = new DocumentGeneratorExtractor();
|
||||
|
||||
for (var doc : domain.doc) {
|
||||
if (doc.documentBody == null) continue;
|
||||
|
||||
var parsed = Jsoup.parse(doc.documentBody.decode());
|
||||
parsed.getElementsByTag("head").comments()
|
||||
.stream().filter(c -> {
|
||||
String data = c.getData();
|
||||
if (data.contains("<script"))
|
||||
return false;
|
||||
if (data.contains("[if"))
|
||||
return false;
|
||||
if (data.contains("shim"))
|
||||
return false;
|
||||
return data.contains("Generated by") || data.contains("generated by")
|
||||
|| data.contains("Powered by") || data.contains("powered by");
|
||||
}).forEach(System.out::println);
|
||||
|
||||
var generators = dge.generatorCleaned(parsed);
|
||||
for (var g : generators.keywords()) {
|
||||
if (seenGenerators.add(g)) {
|
||||
System.out.println(g + "->" + generators.type());
|
||||
if (generators.type() == GeneratorType.UNKNOWN) {
|
||||
System.out.println(parsed.select("meta[name=generator]")
|
||||
.attr("content"));
|
||||
System.out.println(doc.url);
|
||||
}
|
||||
}
|
||||
var tagExtractor = new BlogSpecialization.BlogTagExtractor();
|
||||
parsed.traverse(tagExtractor);
|
||||
var tags = tagExtractor.getTags();
|
||||
if (!tags.isEmpty()) {
|
||||
System.out.println(tags);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//
|
||||
// var ret = domainProcessor.process(domain);
|
||||
//
|
||||
//
|
||||
// ret.documents.stream()
|
||||
// .filter(ProcessedDocument::isProcessedFully)
|
||||
// .peek(d -> System.out.println(d.url))
|
||||
// .map(d -> d.details.metadata)
|
||||
// .forEach(System.out::println);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user