mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 13:19:02 +00:00
Better fingerprinting (#35)
* Better fingerprinting for server tech * Many more features in FeatureExtractor * Blog specialization * SiteType table
This commit is contained in:
parent
ae9537b68e
commit
0f9b90eb1c
@ -2,6 +2,7 @@ plugins {
|
|||||||
id 'java'
|
id 'java'
|
||||||
id "io.freefair.lombok" version "5.3.3.3"
|
id "io.freefair.lombok" version "5.3.3.3"
|
||||||
id 'jvm-test-suite'
|
id 'jvm-test-suite'
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
java {
|
java {
|
||||||
@ -32,8 +33,14 @@ dependencies {
|
|||||||
testImplementation libs.bundles.slf4j.test
|
testImplementation libs.bundles.slf4j.test
|
||||||
testImplementation libs.bundles.junit
|
testImplementation libs.bundles.junit
|
||||||
testImplementation libs.mockito
|
testImplementation libs.mockito
|
||||||
|
|
||||||
|
|
||||||
|
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
|
||||||
|
testImplementation 'org.testcontainers:mariadb:1.17.4'
|
||||||
|
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
test {
|
test {
|
||||||
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
|
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
|
||||||
maxHeapSize = "8G"
|
maxHeapSize = "8G"
|
||||||
@ -47,4 +54,3 @@ task fastTests(type: Test) {
|
|||||||
excludeTags "slow"
|
excludeTags "slow"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
179
code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java
Normal file
179
code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java
Normal file
@ -0,0 +1,179 @@
|
|||||||
|
package nu.marginalia.db;
|
||||||
|
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.model.EdgeDomain;
|
||||||
|
import nu.marginalia.model.id.EdgeIdList;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
|
||||||
|
import javax.inject.Inject;
|
||||||
|
import javax.inject.Singleton;
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/** A list of domains that are known to be of a certain type */
|
||||||
|
@Singleton
|
||||||
|
public class DomainTypes {
|
||||||
|
|
||||||
|
public enum Type {
|
||||||
|
BLOG,
|
||||||
|
TEST
|
||||||
|
};
|
||||||
|
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(DomainTypes.class);
|
||||||
|
|
||||||
|
private final HikariDataSource dataSource;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public DomainTypes(HikariDataSource dataSource) {
|
||||||
|
this.dataSource = dataSource;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Get all domains of a certain type, including domains that are not in the EC_DOMAIN table */
|
||||||
|
public List<String> getAllDomainsByType(Type type) {
|
||||||
|
List<String> ret = new ArrayList<>();
|
||||||
|
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("""
|
||||||
|
SELECT DOMAIN_NAME
|
||||||
|
FROM DOMAIN_SELECTION INNER JOIN DOMAIN_SELECTION_TYPE ON DOMAIN_TYPE_ID = DOMAIN_SELECTION_TYPE.ID
|
||||||
|
WHERE DOMAIN_SELECTION_TYPE.NAME = ?
|
||||||
|
"""))
|
||||||
|
{
|
||||||
|
stmt.setString(1, type.name());
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
while (rs.next()) {
|
||||||
|
ret.add(rs.getString(1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
throw new RuntimeException(ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Retrieve the EdgeId of all domains of a certain type,
|
||||||
|
* ignoring entries that are not in the EC_DOMAIN table */
|
||||||
|
public EdgeIdList<EdgeDomain> getKnownDomainsByType(Type type) {
|
||||||
|
EdgeIdList<EdgeDomain> ret = new EdgeIdList<>();
|
||||||
|
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("""
|
||||||
|
SELECT EC_DOMAIN.ID
|
||||||
|
FROM DOMAIN_SELECTION
|
||||||
|
INNER JOIN DOMAIN_SELECTION_TYPE ON DOMAIN_TYPE_ID = DOMAIN_SELECTION_TYPE.ID
|
||||||
|
INNER JOIN EC_DOMAIN ON DOMAIN_SELECTION.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME
|
||||||
|
WHERE DOMAIN_SELECTION_TYPE.NAME = ?
|
||||||
|
"""))
|
||||||
|
{
|
||||||
|
stmt.setString(1, type.name());
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
while (rs.next()) {
|
||||||
|
ret.add(rs.getInt(1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
throw new RuntimeException(ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Reload the list of domains of a certain type from the source */
|
||||||
|
public void reloadDomainsList(Type type) throws IOException, SQLException {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("""
|
||||||
|
SELECT SOURCE, ID FROM DOMAIN_SELECTION_TYPE WHERE NAME = ?
|
||||||
|
""");
|
||||||
|
var deleteStatement = conn.prepareStatement("""
|
||||||
|
DELETE FROM DOMAIN_SELECTION WHERE DOMAIN_TYPE_ID = ?
|
||||||
|
""");
|
||||||
|
var insertStatement = conn.prepareStatement("""
|
||||||
|
INSERT IGNORE INTO DOMAIN_SELECTION (DOMAIN_NAME, DOMAIN_TYPE_ID) VALUES (?, ?)
|
||||||
|
""")
|
||||||
|
)
|
||||||
|
{
|
||||||
|
stmt.setString(1, type.name());
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
|
||||||
|
if (!rsp.next()) {
|
||||||
|
throw new RuntimeException("No such domain selection type: " + type);
|
||||||
|
}
|
||||||
|
|
||||||
|
var source = rsp.getString(1);
|
||||||
|
int typeId = rsp.getInt(2);
|
||||||
|
|
||||||
|
List<String> downloadDomains = downloadDomainsList(source);
|
||||||
|
|
||||||
|
try {
|
||||||
|
conn.setAutoCommit(false);
|
||||||
|
deleteStatement.setInt(1, typeId);
|
||||||
|
deleteStatement.executeUpdate();
|
||||||
|
|
||||||
|
for (String domain : downloadDomains) {
|
||||||
|
insertStatement.setString(1, domain);
|
||||||
|
insertStatement.setInt(2, typeId);
|
||||||
|
insertStatement.executeUpdate();
|
||||||
|
// Could use batch insert here, but this executes infrequently, so it's not worth the hassle
|
||||||
|
}
|
||||||
|
|
||||||
|
conn.commit();
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
conn.rollback();
|
||||||
|
throw ex;
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
conn.setAutoCommit(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> downloadDomainsList(String source) throws IOException {
|
||||||
|
List<String> ret = new ArrayList<>();
|
||||||
|
|
||||||
|
logger.info("Downloading domain list from {}", source);
|
||||||
|
|
||||||
|
try (var br = new BufferedReader(new InputStreamReader(new URL(source).openStream()))) {
|
||||||
|
String line;
|
||||||
|
|
||||||
|
while ((line = br.readLine()) != null) {
|
||||||
|
line = cleanDomainListLine(line);
|
||||||
|
|
||||||
|
|
||||||
|
if (isValidDomainListEntry(line))
|
||||||
|
ret.add(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info("-- found {}", ret.size());
|
||||||
|
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String cleanDomainListLine(String line) {
|
||||||
|
line = line.trim();
|
||||||
|
|
||||||
|
int hashIdx = line.indexOf('#');
|
||||||
|
if (hashIdx >= 0)
|
||||||
|
line = line.substring(0, hashIdx).trim();
|
||||||
|
|
||||||
|
return line;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isValidDomainListEntry(String line) {
|
||||||
|
if (line.isBlank())
|
||||||
|
return false;
|
||||||
|
if (!line.matches("[a-z0-9\\-.]+"))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,19 @@
|
|||||||
|
-- The known domain selection types, and the URL each type's domain list is downloaded from
CREATE TABLE IF NOT EXISTS DOMAIN_SELECTION_TYPE (
    ID INT PRIMARY KEY AUTO_INCREMENT,
    NAME VARCHAR(255) UNIQUE,
    SOURCE VARCHAR(255) NOT NULL
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_bin;

-- The domains belonging to each type.  DOMAIN_NAME is the primary key, so a domain
-- has at most one row; rows are removed along with their type (ON DELETE CASCADE).
-- IF NOT EXISTS keeps the script re-runnable, like the statements around it.
CREATE TABLE IF NOT EXISTS DOMAIN_SELECTION (
    DOMAIN_NAME VARCHAR(255) PRIMARY KEY,
    DOMAIN_TYPE_ID INT,
    FOREIGN KEY (DOMAIN_TYPE_ID) REFERENCES DOMAIN_SELECTION_TYPE(ID) ON DELETE CASCADE
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;

-- Seed the types; IGNORE skips rows whose NAME already exists
INSERT IGNORE INTO DOMAIN_SELECTION_TYPE(NAME, SOURCE)
VALUES ('BLOG', 'https://raw.githubusercontent.com/MarginaliaSearch/submit-site-to-marginalia-search/master/blogs.txt'),
       ('TEST', 'https://downloads.marginalia.nu/domain-list-test.txt');
|
@ -0,0 +1,63 @@
|
|||||||
|
package nu.marginalia.db;
|
||||||
|
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
import com.zaxxer.hikari.HikariConfig;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import org.junit.jupiter.api.AfterAll;
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.testcontainers.containers.MariaDBContainer;
|
||||||
|
import org.testcontainers.junit.jupiter.Container;
|
||||||
|
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
@Testcontainers
|
||||||
|
public class DomainTypesTest {
|
||||||
|
@Container
|
||||||
|
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||||
|
.withDatabaseName("WMSA_prod")
|
||||||
|
.withUsername("wmsa")
|
||||||
|
.withPassword("wmsa")
|
||||||
|
.withInitScript("sql/current/10-domain-type.sql")
|
||||||
|
.withNetworkAliases("mariadb");
|
||||||
|
|
||||||
|
static HikariDataSource dataSource;
|
||||||
|
static DomainTypes domainTypes;
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
public static void setup() {
|
||||||
|
HikariConfig config = new HikariConfig();
|
||||||
|
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
|
||||||
|
config.setUsername("wmsa");
|
||||||
|
config.setPassword("wmsa");
|
||||||
|
|
||||||
|
dataSource = new HikariDataSource(config);
|
||||||
|
|
||||||
|
domainTypes = new DomainTypes(dataSource);
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterAll
|
||||||
|
public static void teardown() {
|
||||||
|
dataSource.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void reloadDomainsList() throws SQLException, IOException {
|
||||||
|
domainTypes.reloadDomainsList(DomainTypes.Type.TEST);
|
||||||
|
|
||||||
|
var downloadedDomains = new HashSet<>(domainTypes.getAllDomainsByType(DomainTypes.Type.TEST));
|
||||||
|
|
||||||
|
var expectedDomains = Set.of("www.marginalia.nu", "search.marginalia.nu",
|
||||||
|
"encyclopedia.marginalia.nu", "memex.marginalia.nu");
|
||||||
|
|
||||||
|
assertEquals(4, downloadedDomains.size());
|
||||||
|
assertEquals(Set.of(), Sets.symmetricDifference(expectedDomains, downloadedDomains));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -6,7 +6,10 @@ public enum HtmlFeature {
|
|||||||
MEDIA( "special:media"),
|
MEDIA( "special:media"),
|
||||||
JS("special:scripts"),
|
JS("special:scripts"),
|
||||||
AFFILIATE_LINK( "special:affiliate"),
|
AFFILIATE_LINK( "special:affiliate"),
|
||||||
TRACKING("special:tracking"),
|
TRACKING_INNOCENT("special:tracking"),
|
||||||
|
TRACKING_EVIL("special:tracking2"),
|
||||||
|
|
||||||
|
VIEWPORT("special:viewport"),
|
||||||
|
|
||||||
COOKIES("special:cookies"),
|
COOKIES("special:cookies"),
|
||||||
CATEGORY_FOOD("category:food"),
|
CATEGORY_FOOD("category:food"),
|
||||||
@ -15,8 +18,43 @@ public enum HtmlFeature {
|
|||||||
|
|
||||||
GA_SPAM("special:gaspam"),
|
GA_SPAM("special:gaspam"),
|
||||||
|
|
||||||
UNKNOWN("special:uncategorized")
|
/** For fingerprinting and ranking */
|
||||||
;
|
OPENGRAPH("special:opengraph"),
|
||||||
|
OPENGRAPH_IMAGE("special:opengraph:image"),
|
||||||
|
TWITTERCARD("special:twittercard"),
|
||||||
|
TWITTERCARD_IMAGE("special:twittercard:image"),
|
||||||
|
FONTAWSESOME("special:fontawesome"),
|
||||||
|
GOOGLEFONTS("special:googlefonts"),
|
||||||
|
DNS_PREFETCH("special:dnsprefetch"),
|
||||||
|
PRELOAD("special:preload"),
|
||||||
|
PRECONNECT("special:preconnect"),
|
||||||
|
PINGBACK("special:pingback"),
|
||||||
|
FEED("special:feed"),
|
||||||
|
WEBMENTION("special:webmention"),
|
||||||
|
INDIEAUTH("special:indieauth"),
|
||||||
|
ME_TAG("special:metag"),
|
||||||
|
NEXT_TAG("special:nexttag"),
|
||||||
|
AMPHTML("special:amphtml"),
|
||||||
|
JSON_LD("special:jsonld"),
|
||||||
|
ORIGIN_TRIAL("special:origintrial"),
|
||||||
|
PROFILE_GMPG("special:profile-gpmg"),
|
||||||
|
QUANTCAST("special:quantcast"),
|
||||||
|
COOKIELAW("special:cookielaw"),
|
||||||
|
DIDOMI("special:didomi"),
|
||||||
|
PARDOT("special:pardot"),
|
||||||
|
ONESIGNAL("special:onesignal"),
|
||||||
|
DATE_TAG("special:date_tag"),
|
||||||
|
NOSCRIPT_TAG("special:noscript_tag"),
|
||||||
|
|
||||||
|
ROBOTS_INDEX("robots:index"),
|
||||||
|
ROBOTS_FOLLOW("robots:follow"),
|
||||||
|
ROBOTS_NOODP("robots:noodp"),
|
||||||
|
ROBOTS_NOYDIR("robots:noydir"),
|
||||||
|
DOFOLLOW_LINK("special:dofollow"),
|
||||||
|
APPLE_TOUCH_ICON("special:appleicon"),
|
||||||
|
|
||||||
|
UNKNOWN("special:uncategorized");
|
||||||
|
|
||||||
|
|
||||||
private final String keyword;
|
private final String keyword;
|
||||||
|
|
||||||
|
@ -17,6 +17,15 @@ public class DocumentLanguageData {
|
|||||||
public final DocumentSentence[] titleSentences;
|
public final DocumentSentence[] titleSentences;
|
||||||
public final TObjectIntHashMap<String> wordCount;
|
public final TObjectIntHashMap<String> wordCount;
|
||||||
|
|
||||||
|
/** for test convenience */
|
||||||
|
public static DocumentLanguageData empty() {
|
||||||
|
return new DocumentLanguageData(
|
||||||
|
new DocumentSentence[0],
|
||||||
|
new DocumentSentence[0],
|
||||||
|
new TObjectIntHashMap<>()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
public int totalNumWords() {
|
public int totalNumWords() {
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
for (int i = 0; i < sentences.length; i++) {
|
for (int i = 0; i < sentences.length; i++) {
|
||||||
|
@ -29,6 +29,7 @@ dependencies {
|
|||||||
implementation project(':code:api:index-api')
|
implementation project(':code:api:index-api')
|
||||||
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
|
implementation project(':code:common:db')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:common:service-discovery')
|
implementation project(':code:common:service-discovery')
|
||||||
|
@ -5,6 +5,7 @@ import com.google.inject.Guice;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Injector;
|
import com.google.inject.Injector;
|
||||||
import nu.marginalia.process.log.WorkLog;
|
import nu.marginalia.process.log.WorkLog;
|
||||||
|
import nu.marginalia.service.module.DatabaseModule;
|
||||||
import plan.CrawlPlanLoader;
|
import plan.CrawlPlanLoader;
|
||||||
import plan.CrawlPlan;
|
import plan.CrawlPlan;
|
||||||
import nu.marginalia.converting.compiler.InstructionsCompiler;
|
import nu.marginalia.converting.compiler.InstructionsCompiler;
|
||||||
@ -33,7 +34,8 @@ public class ConverterMain {
|
|||||||
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
||||||
|
|
||||||
Injector injector = Guice.createInjector(
|
Injector injector = Guice.createInjector(
|
||||||
new ConverterModule(plan)
|
new ConverterModule(plan),
|
||||||
|
new DatabaseModule()
|
||||||
);
|
);
|
||||||
|
|
||||||
injector.getInstance(ConverterMain.class);
|
injector.getInstance(ConverterMain.class);
|
||||||
|
@ -0,0 +1,53 @@
|
|||||||
|
package nu.marginalia.converting.processor;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.db.DomainTypes;
|
||||||
|
import nu.marginalia.model.EdgeDomain;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/** Converter-side wrapper for of common:db's DomainTypes,
|
||||||
|
* which is a list of domains of a known type (e.g. blog)
|
||||||
|
*/
|
||||||
|
@Singleton
|
||||||
|
public class ConverterDomainTypes {
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(ConverterDomainTypes.class);
|
||||||
|
private final Map<EdgeDomain, DomainType> domainTypes = new HashMap<>();
|
||||||
|
|
||||||
|
private enum DomainType {
|
||||||
|
BLOG
|
||||||
|
}
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public ConverterDomainTypes(DomainTypes types) throws SQLException {
|
||||||
|
var allBlogs = types.getAllDomainsByType(DomainTypes.Type.BLOG);
|
||||||
|
|
||||||
|
if (allBlogs.isEmpty()) {
|
||||||
|
logger.info("No domains of type BLOG found in database, downloading list");
|
||||||
|
try {
|
||||||
|
types.reloadDomainsList(DomainTypes.Type.BLOG);
|
||||||
|
allBlogs = types.getAllDomainsByType(DomainTypes.Type.BLOG);
|
||||||
|
}
|
||||||
|
catch (IOException ex) {
|
||||||
|
logger.error("Failed to download domains list", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var item : allBlogs) {
|
||||||
|
domainTypes.put(new EdgeDomain(item), DomainType.BLOG);
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info("Loaded {} domain types", domainTypes.size());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isBlog(EdgeDomain domain) {
|
||||||
|
return domainTypes.get(domain) == DomainType.BLOG;
|
||||||
|
}
|
||||||
|
}
|
@ -12,13 +12,13 @@ import java.util.List;
|
|||||||
public class DocumentGeneratorExtractor {
|
public class DocumentGeneratorExtractor {
|
||||||
private static final String defaultValue = "unset";
|
private static final String defaultValue = "unset";
|
||||||
|
|
||||||
public DocumentGenerator generatorCleaned(Document doc) {
|
public DocumentGenerator detectGenerator(Document doc, String responseHeaders) {
|
||||||
|
|
||||||
var tags = doc.select("meta[name=generator]");
|
var tags = doc.select("meta[name=generator]");
|
||||||
|
|
||||||
if (tags.size() == 0) {
|
if (tags.size() == 0) {
|
||||||
// Some sites have a comment in the head instead of a meta tag
|
// Some sites have a comment in the head instead of a meta tag
|
||||||
return fingerprintByComments(doc);
|
return fingerprintServerTech(doc, responseHeaders);
|
||||||
}
|
}
|
||||||
if (tags.size() > 1) {
|
if (tags.size() > 1) {
|
||||||
return DocumentGenerator.multiple();
|
return DocumentGenerator.multiple();
|
||||||
@ -29,11 +29,14 @@ public class DocumentGeneratorExtractor {
|
|||||||
generator = removePrefixOrSuffix(generator);
|
generator = removePrefixOrSuffix(generator);
|
||||||
|
|
||||||
if (generator.isBlank())
|
if (generator.isBlank())
|
||||||
return DocumentGenerator.unset();
|
return fingerprintServerTech(doc, responseHeaders);
|
||||||
|
|
||||||
|
if (generator.startsWith("AMP by WP"))
|
||||||
|
return DocumentGenerator.of("wordpress", "wordpress-amp");
|
||||||
|
|
||||||
String[] parts = StringUtils.split(generator, " ,:!");
|
String[] parts = StringUtils.split(generator, " ,:!");
|
||||||
if (parts.length == 0)
|
if (parts.length == 0)
|
||||||
return DocumentGenerator.unset();
|
return fingerprintServerTech(doc, responseHeaders);
|
||||||
|
|
||||||
int slashIdx = parts[0].indexOf('/');
|
int slashIdx = parts[0].indexOf('/');
|
||||||
if (slashIdx >= 0) {
|
if (slashIdx >= 0) {
|
||||||
@ -42,7 +45,7 @@ public class DocumentGeneratorExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (parts.length > 3) {
|
if (parts.length > 3) {
|
||||||
return DocumentGenerator.unset(); // if it's still very long after trim(), it's probably a custom hand written message
|
return fingerprintServerTech(doc, responseHeaders); // if it's still very long after trim(), it's probably a custom hand written message
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (parts[0]) {
|
switch (parts[0]) {
|
||||||
@ -73,7 +76,7 @@ public class DocumentGeneratorExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Fallback logic when there is no meta tag
|
// Fallback logic when there is no meta tag
|
||||||
private DocumentGenerator fingerprintByComments(Document doc) {
|
private DocumentGenerator fingerprintServerTech(Document doc, String responseHeaders) {
|
||||||
|
|
||||||
for (var comment : doc.getElementsByTag("head").comments()) {
|
for (var comment : doc.getElementsByTag("head").comments()) {
|
||||||
String data = comment.getData();
|
String data = comment.getData();
|
||||||
@ -81,22 +84,43 @@ public class DocumentGeneratorExtractor {
|
|||||||
if (data.contains("Generated by javadoc")) {
|
if (data.contains("Generated by javadoc")) {
|
||||||
return DocumentGenerator.of("javadoc");
|
return DocumentGenerator.of("javadoc");
|
||||||
}
|
}
|
||||||
|
if (data.contains("Squarespace")) {
|
||||||
|
return DocumentGenerator.of("squarespace");
|
||||||
|
}
|
||||||
if (data.contains("phpBB")) {
|
if (data.contains("phpBB")) {
|
||||||
return DocumentGenerator.of("phpbb");
|
return DocumentGenerator.of("phpbb");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (var tag : doc.head().getElementsByTag("script")) {
|
for (var tag : doc.head().getElementsByTag("script")) {
|
||||||
if (tag.html().contains("window.lemmyConfig")) {
|
String scriptSrc = tag.attr("src");
|
||||||
return DocumentGenerator.of("lemmy");
|
|
||||||
}
|
if (scriptSrc.contains("wp-content") || scriptSrc.contains("wp-includes")) {
|
||||||
if (tag.html().contains("URL_DOMAIN = 'wikidot.com'")) {
|
|
||||||
return DocumentGenerator.of("wikidot");
|
|
||||||
}
|
|
||||||
if (tag.attr("src").contains("wp-content")) {
|
|
||||||
return DocumentGenerator.of("wordpress", "wordpress-sneaky");
|
return DocumentGenerator.of("wordpress", "wordpress-sneaky");
|
||||||
}
|
}
|
||||||
|
if (scriptSrc.contains("squarespace.com")) {
|
||||||
|
return DocumentGenerator.of("squarespace");
|
||||||
|
}
|
||||||
|
if (scriptSrc.contains("cdn.cloversites.com")) {
|
||||||
|
return DocumentGenerator.of("cloversites");
|
||||||
|
}
|
||||||
|
if (scriptSrc.contains("bndzgl.com")) {
|
||||||
|
return DocumentGenerator.of("bndzgl");
|
||||||
|
}
|
||||||
|
if (scriptSrc.contains("editmysite.com")) {
|
||||||
|
return DocumentGenerator.of("editmysite");
|
||||||
|
}
|
||||||
|
if (scriptSrc.contains("website-editor.net")) {
|
||||||
|
return DocumentGenerator.of("website-editor.net");
|
||||||
|
}
|
||||||
|
String scriptHtml = tag.html();
|
||||||
|
if (scriptHtml.contains("window.lemmyConfig")) {
|
||||||
|
return DocumentGenerator.of("lemmy");
|
||||||
|
}
|
||||||
|
if (scriptHtml.contains("URL_DOMAIN = 'wikidot.com'")) {
|
||||||
|
return DocumentGenerator.of("wikidot");
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (var tag : doc.head().getElementsByTag("link")) {
|
for (var tag : doc.head().getElementsByTag("link")) {
|
||||||
@ -109,6 +133,10 @@ public class DocumentGeneratorExtractor {
|
|||||||
return DocumentGenerator.of("flarum");
|
return DocumentGenerator.of("flarum");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (doc.getElementById("tracpowered") != null) {
|
||||||
|
return DocumentGenerator.of("trac");
|
||||||
|
}
|
||||||
|
|
||||||
if (doc.getElementById("_xfClientLoadTime") != null) {
|
if (doc.getElementById("_xfClientLoadTime") != null) {
|
||||||
return DocumentGenerator.of("xenforo");
|
return DocumentGenerator.of("xenforo");
|
||||||
}
|
}
|
||||||
@ -117,6 +145,48 @@ public class DocumentGeneratorExtractor {
|
|||||||
return DocumentGenerator.of("invision");
|
return DocumentGenerator.of("invision");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (doc.getElementById("___gatsby") != null) {
|
||||||
|
return DocumentGenerator.of("gatsby");
|
||||||
|
}
|
||||||
|
|
||||||
|
String[] headers = responseHeaders.toLowerCase().split("\n+");
|
||||||
|
for (var header : headers) {
|
||||||
|
if (header.contains("x-drupal-cache")) {
|
||||||
|
return DocumentGenerator.of("drupal");
|
||||||
|
}
|
||||||
|
if (header.contains("x-powered-by: asp.net")) {
|
||||||
|
return DocumentGenerator.of("asp.net");
|
||||||
|
}
|
||||||
|
if (header.contains("x-powered-by: php")) {
|
||||||
|
return DocumentGenerator.of("php");
|
||||||
|
}
|
||||||
|
if (header.contains("x-powered-by: wp engine")) {
|
||||||
|
return DocumentGenerator.of("wordpress", "wp-engine", "wordpress-sneaky");
|
||||||
|
}
|
||||||
|
if (header.contains("x-powered-by: statamic")) {
|
||||||
|
return DocumentGenerator.of("laravel", "statamic");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// These should be all the way down as they are the most generic
|
||||||
|
for (var header : headers) {
|
||||||
|
if (header.contains("server: mastodon")) {
|
||||||
|
return DocumentGenerator.of("mastodon");
|
||||||
|
}
|
||||||
|
if (header.contains("server: gunicorn")) {
|
||||||
|
return DocumentGenerator.of("gunicorn");
|
||||||
|
}
|
||||||
|
if (header.contains("server: nginx")) {
|
||||||
|
return DocumentGenerator.of("nginx");
|
||||||
|
}
|
||||||
|
if (header.contains("server: apache")) {
|
||||||
|
return DocumentGenerator.of("apache");
|
||||||
|
}
|
||||||
|
if (header.contains("server: cowboy")) {
|
||||||
|
return DocumentGenerator.of("cowboy"); // erlang, really?!
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return DocumentGenerator.unset();
|
return DocumentGenerator.unset();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -138,6 +208,11 @@ public class DocumentGeneratorExtractor {
|
|||||||
generator = generator.substring(0, dashIdx);
|
generator = generator.substring(0, dashIdx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int parenIdx = generator.indexOf('('); // Some strings have values like 'Drupal 9 (https://www.drupal.org)'
|
||||||
|
if (parenIdx >= 0) {
|
||||||
|
generator = generator.substring(0, parenIdx);
|
||||||
|
}
|
||||||
|
|
||||||
if (!StringUtils.isAsciiPrintable(generator))
|
if (!StringUtils.isAsciiPrintable(generator))
|
||||||
return "";
|
return "";
|
||||||
|
|
||||||
@ -170,11 +245,18 @@ public class DocumentGeneratorExtractor {
|
|||||||
final GeneratorType type = switch (parts[0]) {
|
final GeneratorType type = switch (parts[0]) {
|
||||||
case "joomla", "wordpress", "drupal", "plone", "postnuke", "divi", "freeway", "unicity",
|
case "joomla", "wordpress", "drupal", "plone", "postnuke", "divi", "freeway", "unicity",
|
||||||
"modx", "sitemagic", "agility", "edlio", "blogger", "slider", "slider_revolution", "gravcms",
|
"modx", "sitemagic", "agility", "edlio", "blogger", "slider", "slider_revolution", "gravcms",
|
||||||
"typo3", "dotnetnuke", "cms", "coremedia", "dspace"
|
"typo3", "dotnetnuke", "cms", "coremedia", "dspace", "laravel", "trac", "bunnypress", "astro",
|
||||||
|
"ghost", "publii"
|
||||||
-> GeneratorType.CMS;
|
-> GeneratorType.CMS;
|
||||||
case "wix.com", "one.com", "wpbakery", "claris", "wordpress.com", "hubspot",
|
case "wix.com", "one.com", "wpbakery", "claris", "wordpress.com", "hubspot",
|
||||||
"visual_composer", "mobirise", "everweb", "rapidweaver", "shorthand",
|
"visual_composer", "mobirise", "everweb", "rapidweaver", "shorthand",
|
||||||
"visual", "nitropack",
|
"visual", "nitropack", "squarespace", "editmysite", "websiteeditor.net",
|
||||||
|
|
||||||
|
"svbtle.com", "write.as", "montaigne.io", // blogging platforms, maybe should be in another category?
|
||||||
|
|
||||||
|
"cloversites", // clover is a church-oriented website builder, found that kinda neat
|
||||||
|
"bndzgl", // band websites ..?
|
||||||
|
|
||||||
/* these are not SAAS but close enough */
|
/* these are not SAAS but close enough */
|
||||||
"redux", "bootply"
|
"redux", "bootply"
|
||||||
-> GeneratorType.SAAS;
|
-> GeneratorType.SAAS;
|
||||||
@ -185,7 +267,8 @@ public class DocumentGeneratorExtractor {
|
|||||||
"pdf2htmlex", "nvu", "mozilla", "golive", "tenfingers", "publisher",
|
"pdf2htmlex", "nvu", "mozilla", "golive", "tenfingers", "publisher",
|
||||||
"allaire", "neooffice"
|
"allaire", "neooffice"
|
||||||
-> GeneratorType.BOOMER_STATIC;
|
-> GeneratorType.BOOMER_STATIC;
|
||||||
case "hugo", "jekyll", "hakyll", "gatsby", "react", "gridsome"
|
case "hugo", "jekyll", "hakyll", "nikola", "zola", "olivetti", "pelican", "sushy", "hexo", "eleventy",
|
||||||
|
"gridsome", "vuepress", "docusaurus", "docpad", "techou", "quarto", "soupault"
|
||||||
-> GeneratorType.ZOOMER_STATIC;
|
-> GeneratorType.ZOOMER_STATIC;
|
||||||
case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano",
|
case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano",
|
||||||
"notepad.exe", "gedit", "me",
|
"notepad.exe", "gedit", "me",
|
||||||
@ -198,9 +281,9 @@ public class DocumentGeneratorExtractor {
|
|||||||
-> GeneratorType.FORUM;
|
-> GeneratorType.FORUM;
|
||||||
case "mediawiki", "dokuwiki", "wikidot", "sharepoint"
|
case "mediawiki", "dokuwiki", "wikidot", "sharepoint"
|
||||||
-> GeneratorType.WIKI;
|
-> GeneratorType.WIKI;
|
||||||
case "pandoc", "mkdocs", "doxygen", "javadoc"
|
case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc"
|
||||||
-> GeneratorType.DOCS;
|
-> GeneratorType.DOCS;
|
||||||
case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic"
|
case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic", "osclass"
|
||||||
-> GeneratorType.ECOMMERCE_AND_SPAM;
|
-> GeneratorType.ECOMMERCE_AND_SPAM;
|
||||||
default
|
default
|
||||||
-> GeneratorType.UNKNOWN;
|
-> GeneratorType.UNKNOWN;
|
||||||
@ -216,7 +299,7 @@ public class DocumentGeneratorExtractor {
|
|||||||
public static DocumentGenerator multiple() {
|
public static DocumentGenerator multiple() {
|
||||||
// It's *generally* WordPress or the like that injects multiple generator tags
|
// It's *generally* WordPress or the like that injects multiple generator tags
|
||||||
|
|
||||||
return new DocumentGenerator(GeneratorType.CMS, List.of(defaultValue));
|
return new DocumentGenerator(GeneratorType.CMS, List.of("wordpress", "wp-best-guess"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -19,14 +19,13 @@ public class DocumentValuator {
|
|||||||
int textLength) throws DisqualifiedException {
|
int textLength) throws DisqualifiedException {
|
||||||
double scriptPenalty = getScriptPenalty(parsedDocument);
|
double scriptPenalty = getScriptPenalty(parsedDocument);
|
||||||
|
|
||||||
int textBodyLength = textLength;
|
|
||||||
int rawLength = crawledDocument.documentBody.length();
|
int rawLength = crawledDocument.documentBody.length();
|
||||||
|
|
||||||
if (textBodyLength == 0) {
|
if (textLength == 0) {
|
||||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
|
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
|
||||||
}
|
}
|
||||||
|
|
||||||
return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale
|
return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale
|
||||||
+ htmlStandard.offset
|
+ htmlStandard.offset
|
||||||
- scriptPenalty;
|
- scriptPenalty;
|
||||||
}
|
}
|
||||||
|
@ -21,25 +21,29 @@ import java.util.Set;
|
|||||||
@Singleton
|
@Singleton
|
||||||
public class FeatureExtractor {
|
public class FeatureExtractor {
|
||||||
|
|
||||||
private static final List<String> trackers = List.of("adform.net",
|
private static final List<String> innocentTrackers = List.of(
|
||||||
|
"twitter.com",
|
||||||
|
"bing.com",
|
||||||
|
"msn.com");
|
||||||
|
private static final List<String> shittyTrackers = List.of("adform.net",
|
||||||
"connect.facebook",
|
"connect.facebook",
|
||||||
|
"facebook.com/tr",
|
||||||
"googletagmanager.com",
|
"googletagmanager.com",
|
||||||
"googlesyndication.com",
|
"googlesyndication.com",
|
||||||
"google.com",
|
|
||||||
"twitter.com",
|
|
||||||
"smartadserver.com",
|
"smartadserver.com",
|
||||||
"doubleclick.com",
|
"doubleclick.com",
|
||||||
"2mdn.com",
|
"2mdn.com",
|
||||||
"dmtry.com",
|
"dmtry.com",
|
||||||
"bing.com",
|
|
||||||
"msn.com",
|
|
||||||
"amazon-adsystem.com",
|
"amazon-adsystem.com",
|
||||||
"alexametrics.com",
|
"alexametrics.com",
|
||||||
"rubiconproject.com",
|
"rubiconproject.com",
|
||||||
"chango.com",
|
"chango.com",
|
||||||
"d5nxst8fruw4z.cloudfront.net",
|
"d5nxst8fruw4z.cloudfront.net",
|
||||||
"d31qbv1cthcecs.cloudfront.net",
|
"d31qbv1cthcecs.cloudfront.net",
|
||||||
"linkedin.com");
|
"linkedin.com",
|
||||||
|
"perfectaudience.com",
|
||||||
|
"marketingautomation.services",
|
||||||
|
"usefathom");
|
||||||
|
|
||||||
private final AdblockSimulator adblockSimulator;
|
private final AdblockSimulator adblockSimulator;
|
||||||
private final RecipeDetector recipeDetector;
|
private final RecipeDetector recipeDetector;
|
||||||
@ -71,21 +75,119 @@ public class FeatureExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (var scriptTag : scriptTags) {
|
for (var scriptTag : scriptTags) {
|
||||||
if (isJavascriptTag(scriptTag)) {
|
final String type = scriptTag.attr("type");
|
||||||
|
|
||||||
|
if ("application/ld+json".equalsIgnoreCase(type)) {
|
||||||
|
features.add(HtmlFeature.JSON_LD);
|
||||||
|
}
|
||||||
|
else {
|
||||||
features.add(HtmlFeature.JS);
|
features.add(HtmlFeature.JS);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!doc.head().getElementsByTag("viewport").isEmpty()) {
|
||||||
|
features.add(HtmlFeature.VIEWPORT);
|
||||||
|
}
|
||||||
|
for (var atag : doc.body().getElementsByTag("a")) {
|
||||||
|
var rel = atag.attr("rel");
|
||||||
|
if (rel.equals("dofollow")) {
|
||||||
|
features.add(HtmlFeature.DOFOLLOW_LINK);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!doc.getElementsByTag("date").isEmpty()) {
|
||||||
|
features.add(HtmlFeature.DATE_TAG);
|
||||||
|
}
|
||||||
|
if (!doc.getElementsByTag("noscript").isEmpty()) {
|
||||||
|
features.add(HtmlFeature.NOSCRIPT_TAG);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
for (var link : doc.head().getElementsByTag("link")) {
|
||||||
|
|
||||||
// 500 IQ web developers use <link> error or load handlers
|
// 500 IQ web developers use <link> error or load handlers
|
||||||
// sneakily load JS without explicit script tags
|
// sneakily load JS without explicit script tags
|
||||||
for (var link : doc.head().getElementsByTag("link")) {
|
if (link.hasAttr("onerror"))
|
||||||
if (link.hasAttr("onerror")) {
|
|
||||||
features.add(HtmlFeature.JS);
|
features.add(HtmlFeature.JS);
|
||||||
break;
|
if (link.hasAttr("onload"))
|
||||||
|
features.add(HtmlFeature.JS);
|
||||||
|
|
||||||
|
if (link.hasAttr("pingback")) {
|
||||||
|
features.add(HtmlFeature.PINGBACK);
|
||||||
}
|
}
|
||||||
if (link.hasAttr("onload")) {
|
|
||||||
features.add(HtmlFeature.JS);
|
|
||||||
break;
|
var href = link.attr("href");
|
||||||
|
|
||||||
|
if (href.contains("indieauth"))
|
||||||
|
features.add(HtmlFeature.INDIEAUTH);
|
||||||
|
|
||||||
|
var rel = link.attr("rel");
|
||||||
|
|
||||||
|
if (rel.equals("webmention"))
|
||||||
|
features.add(HtmlFeature.WEBMENTION);
|
||||||
|
|
||||||
|
if (rel.equals("me"))
|
||||||
|
features.add(HtmlFeature.ME_TAG);
|
||||||
|
|
||||||
|
if (rel.equals("next"))
|
||||||
|
features.add(HtmlFeature.NEXT_TAG);
|
||||||
|
|
||||||
|
if (rel.equals("alternate") && link.hasAttr("type"))
|
||||||
|
features.add(HtmlFeature.FEED);
|
||||||
|
|
||||||
|
if (rel.equals("dns-prefetch"))
|
||||||
|
features.add(HtmlFeature.DNS_PREFETCH);
|
||||||
|
|
||||||
|
if (rel.equals("preload"))
|
||||||
|
features.add(HtmlFeature.PRELOAD);
|
||||||
|
|
||||||
|
if (rel.equals("preconnect"))
|
||||||
|
features.add(HtmlFeature.PRECONNECT);
|
||||||
|
|
||||||
|
if (rel.equals("amphtml"))
|
||||||
|
features.add(HtmlFeature.AMPHTML);
|
||||||
|
|
||||||
|
if (rel.equals("apple-touch-icon"))
|
||||||
|
features.add(HtmlFeature.APPLE_TOUCH_ICON);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var meta : doc.head().getElementsByTag("meta")) {
|
||||||
|
// <meta name="robots" content="index,follow">
|
||||||
|
if (meta.attr("name").equals("robots")) {
|
||||||
|
var content = meta.attr("content");
|
||||||
|
if (!content.contains("noindex") && content.contains("index")) {
|
||||||
|
features.add(HtmlFeature.ROBOTS_INDEX);
|
||||||
|
}
|
||||||
|
if (!content.contains("nofollow") && content.contains("follow")) {
|
||||||
|
features.add(HtmlFeature.ROBOTS_FOLLOW);
|
||||||
|
}
|
||||||
|
if (content.contains("noodp")) {
|
||||||
|
features.add(HtmlFeature.ROBOTS_NOODP);
|
||||||
|
}
|
||||||
|
if (content.contains("noydir")) {
|
||||||
|
features.add(HtmlFeature.ROBOTS_NOYDIR);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (meta.attr("profile").contains("gmpg")) {
|
||||||
|
features.add(HtmlFeature.PROFILE_GMPG);
|
||||||
|
}
|
||||||
|
if (meta.attr("property").equals("og:description")) {
|
||||||
|
features.add(HtmlFeature.OPENGRAPH);
|
||||||
|
}
|
||||||
|
if (meta.attr("property").equals("og:image")) {
|
||||||
|
features.add(HtmlFeature.OPENGRAPH_IMAGE);
|
||||||
|
}
|
||||||
|
if (meta.attr("name").equals("twitter:description")) {
|
||||||
|
features.add(HtmlFeature.TWITTERCARD);
|
||||||
|
}
|
||||||
|
if (meta.attr("name").equals("twitter:image")) {
|
||||||
|
features.add(HtmlFeature.TWITTERCARD_IMAGE);
|
||||||
|
}
|
||||||
|
if (meta.attr("http-equiv").equals("origin-trial")) {
|
||||||
|
features.add(HtmlFeature.ORIGIN_TRIAL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -100,14 +202,74 @@ public class FeatureExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (var scriptTag : scriptTags) {
|
for (var scriptTag : scriptTags) {
|
||||||
if (hasTrackingScript(scriptTag)) {
|
if (hasInvasiveTrackingScript(scriptTag)) {
|
||||||
features.add(HtmlFeature.TRACKING);
|
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||||
break;
|
features.add(HtmlFeature.TRACKING_EVIL);
|
||||||
|
}
|
||||||
|
else if (hasNaiveTrackingScript(scriptTag)) {
|
||||||
|
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (scriptTag.hasAttr("didomi/javascript")) {
|
||||||
|
features.add(HtmlFeature.DIDOMI);
|
||||||
|
}
|
||||||
|
|
||||||
|
String src = scriptTag.attr("src");
|
||||||
|
if (src.contains("OneSignalSDK")) {
|
||||||
|
features.add(HtmlFeature.ONESIGNAL);
|
||||||
|
}
|
||||||
|
|
||||||
|
String scriptText = scriptTag.html();
|
||||||
|
|
||||||
|
if (scriptText.contains("'pd.js'")) {
|
||||||
|
features.add(HtmlFeature.PARDOT);
|
||||||
|
}
|
||||||
|
if (scriptText.contains("https://cmp.quantcast.com")) {
|
||||||
|
features.add(HtmlFeature.QUANTCAST);
|
||||||
|
}
|
||||||
|
if (scriptText.contains("https://quantcast.mgr.consensu.org")) {
|
||||||
|
features.add(HtmlFeature.QUANTCAST);
|
||||||
|
}
|
||||||
|
if (scriptText.contains("https://cdn.cookielaw.org")) {
|
||||||
|
features.add(HtmlFeature.COOKIELAW);
|
||||||
|
}
|
||||||
|
if (scriptText.contains("_linkedin_data_partner_id")) {
|
||||||
|
features.add(HtmlFeature.TRACKING_EVIL);
|
||||||
|
}
|
||||||
|
if (scriptText.contains("window.OneSignal")) {
|
||||||
|
features.add(HtmlFeature.ONESIGNAL);
|
||||||
|
}
|
||||||
|
if (scriptText.contains("connect.facebook.net")) {
|
||||||
|
features.add(HtmlFeature.TRACKING_EVIL);
|
||||||
|
}
|
||||||
|
if (scriptText.contains("hotjar.com")) {
|
||||||
|
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var noscript : doc.getElementsByTag("noscript")) {
|
||||||
|
for (var iframe : noscript.getElementsByTag("iframe")) {
|
||||||
|
if (hasInvasiveTrackingScript(iframe)) {
|
||||||
|
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||||
|
features.add(HtmlFeature.TRACKING_EVIL);
|
||||||
|
}
|
||||||
|
else if (hasNaiveTrackingScript(iframe)) {
|
||||||
|
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (var img : noscript.getElementsByTag("img")) {
|
||||||
|
if (hasInvasiveTrackingScript(img)) {
|
||||||
|
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||||
|
features.add(HtmlFeature.TRACKING_EVIL);
|
||||||
|
}
|
||||||
|
else if (hasNaiveTrackingScript(img)) {
|
||||||
|
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (scriptTags.html().contains("google-analytics.com")) {
|
if (scriptTags.html().contains("google-analytics.com")) {
|
||||||
features.add(HtmlFeature.TRACKING);
|
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (var aTag : doc.getElementsByTag("a")) {
|
for (var aTag : doc.getElementsByTag("a")) {
|
||||||
@ -129,29 +291,32 @@ public class FeatureExtractor {
|
|||||||
return features;
|
return features;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean hasTrackingScript(Element scriptTag) {
|
private boolean hasInvasiveTrackingScript(Element scriptTag) {
|
||||||
return hasTrackingScript(scriptTag.attr("src"));
|
return hasInvasiveTrackingScript(scriptTag.attr("src"));
|
||||||
}
|
}
|
||||||
|
private boolean hasNaiveTrackingScript(Element scriptTag) {
|
||||||
|
return hasNaiveTrackingScript(scriptTag.attr("src"));
|
||||||
|
}
|
||||||
|
private boolean hasInvasiveTrackingScript(String src) {
|
||||||
|
|
||||||
private boolean hasTrackingScript(String scriptText) {
|
for (var tracker : shittyTrackers) {
|
||||||
|
if (src.contains(tracker)) {
|
||||||
for (var tracker : trackers) {
|
|
||||||
if (scriptText.contains(tracker)) {
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isJavascriptTag(Element scriptTag) {
|
private boolean hasNaiveTrackingScript(String src) {
|
||||||
final String type = scriptTag.attr("type");
|
|
||||||
|
|
||||||
if ("application/ld+json".equalsIgnoreCase(type)) {
|
for (var tracker : innocentTrackers) {
|
||||||
|
if (src.contains(tracker)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean isAmazonAffiliateLink(Element aTag) {
|
boolean isAmazonAffiliateLink(Element aTag) {
|
||||||
final String href = aTag.attr("href").toLowerCase();
|
final String href = aTag.attr("href").toLowerCase();
|
||||||
|
@ -111,9 +111,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
|
|
||||||
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
|
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
|
||||||
|
|
||||||
final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
|
final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, crawledDocument.headers);
|
||||||
|
|
||||||
final var specialization = htmlProcessorSpecializations.select(generatorParts);
|
final var specialization = htmlProcessorSpecializations.select(generatorParts, url);
|
||||||
|
|
||||||
if (!specialization.shouldIndex(url)) {
|
if (!specialization.shouldIndex(url)) {
|
||||||
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
|
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
|
||||||
@ -167,7 +167,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
.addGenerator(generatorParts.keywords())
|
.addGenerator(generatorParts.keywords())
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
|
|
||||||
words.addAllSyntheticTerms(tagWords);
|
words.addAllSyntheticTerms(tagWords);
|
||||||
|
specialization.amendWords(doc, words);
|
||||||
|
|
||||||
getLinks(url, ret, doc, words);
|
getLinks(url, ret, doc, words);
|
||||||
|
|
||||||
@ -216,8 +218,23 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Annoying wordpress crap
|
// Annoying blog crap
|
||||||
if (url.path.startsWith("/tag/") && url.path.endsWith("/")) {
|
if (url.path.contains("/tag/") && url.path.endsWith("/")) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (url.path.contains("/tags/") && url.path.endsWith("/")) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (url.path.contains("/category/") && url.path.endsWith("/")) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (url.path.contains("/categories/") && url.path.endsWith("/")) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (url.path.contains("/section/") && url.path.endsWith("/")) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (url.path.contains("/sections/") && url.path.endsWith("/")) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
@ -0,0 +1,210 @@
|
|||||||
|
package nu.marginalia.converting.processor.plugin.specialization;
|
||||||
|
|
||||||
|
import ca.rmen.porterstemmer.PorterStemmer;
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
import nu.marginalia.model.idx.WordFlags;
|
||||||
|
import nu.marginalia.summary.SummaryExtractor;
|
||||||
|
import org.apache.logging.log4j.util.Strings;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.nodes.Node;
|
||||||
|
import org.jsoup.select.NodeFilter;
|
||||||
|
import org.jsoup.select.NodeVisitor;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.function.Predicate;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
/** The blog specialization is used for blogs, and makes heavy assumptions about the nature of the document
|
||||||
|
* that aren't generally true, but if the categorization is correct, will yield much better results.
|
||||||
|
*/
|
||||||
|
@Singleton
|
||||||
|
public class BlogSpecialization extends DefaultSpecialization {
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public BlogSpecialization(SummaryExtractor summaryExtractor) {
|
||||||
|
super(summaryExtractor);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Document prune(Document original) {
|
||||||
|
var doc = original.clone();
|
||||||
|
|
||||||
|
// Remove all nav junk, comments and other stuff
|
||||||
|
doc.filter(new BlogPruningFilter());
|
||||||
|
|
||||||
|
// If there is an article tag, use that as the root
|
||||||
|
var articleTags = doc.getElementsByTag("article");
|
||||||
|
var firstArticle = articleTags.first();
|
||||||
|
if (firstArticle != null) {
|
||||||
|
var art = firstArticle.clone();
|
||||||
|
|
||||||
|
doc.body().empty();
|
||||||
|
doc.body().appendChild(art);
|
||||||
|
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use the default pruning as a fallback
|
||||||
|
return super.prune(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getSummary(Document original, Set<String> importantWords) {
|
||||||
|
return super.getSummary(original, importantWords);
|
||||||
|
}
|
||||||
|
|
||||||
|
private final static List<String> badPathElements =
|
||||||
|
List.of("/tag/", "/tags/", "/tagged/", "/category/", "/categories/", "/section/", "/sections/", "/page/", "/author/");
|
||||||
|
|
||||||
|
private final static Predicate<String> dateIndexTest1 = Pattern.compile("^/(\\d{4}/(\\d{2}/){0,2}?)$").asMatchPredicate();
|
||||||
|
private final static Predicate<String> dateIndexTest2 = Pattern.compile("^/(\\d{2}/){1,2}$").asMatchPredicate();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean shouldIndex(EdgeUrl url) {
|
||||||
|
String path = url.path;
|
||||||
|
|
||||||
|
// Don't index the root path for blogs, as it is usually an ephemeral list of all posts
|
||||||
|
if ("/".equals(path)) return false;
|
||||||
|
|
||||||
|
// Likewise for the blog's home page
|
||||||
|
if (path.endsWith("/blog/")) return false;
|
||||||
|
if (path.endsWith("/log/")) return false;
|
||||||
|
if (path.endsWith("/weblog/")) return false;
|
||||||
|
if (path.endsWith("/posts/")) return false;
|
||||||
|
if (path.endsWith("/articles/")) return false;
|
||||||
|
|
||||||
|
// Refuse paths that contain any of the bad path elements
|
||||||
|
for (String badPathElement : badPathElements) {
|
||||||
|
if (path.contains(badPathElement)) return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// We don't want chronological listings
|
||||||
|
if (dateIndexTest1.test(path)) return false;
|
||||||
|
if (dateIndexTest2.test(path)) return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static PorterStemmer ps = new PorterStemmer();
|
||||||
|
public void amendWords(Document doc, DocumentKeywordsBuilder words) {
|
||||||
|
var tagExtractor = new BlogTagExtractor();
|
||||||
|
doc.traverse(tagExtractor);
|
||||||
|
|
||||||
|
var tags = tagExtractor.getTags();
|
||||||
|
if (!tags.isEmpty()) {
|
||||||
|
var stemmed = tags.stream().map(ps::stemWord).collect(Collectors.toSet());
|
||||||
|
words.setFlagOnMetadataForWords(WordFlags.Subjects, stemmed);
|
||||||
|
|
||||||
|
Set<String> specialTags = tags.stream().map(s -> "tag:" + s).collect(Collectors.toSet());
|
||||||
|
words.addAllSyntheticTerms(specialTags);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Removes all the non-content elements from the document,
|
||||||
|
* making strong blog-specific assumptions about the nature of
|
||||||
|
* the layout */
|
||||||
|
private static class BlogPruningFilter implements NodeFilter {
|
||||||
|
private static final List<String> badClassElements = Arrays.asList("comment", "reply", "sidebar", "header", "footer", "nav");
|
||||||
|
private static final List<String> badIdElements = Arrays.asList("comments", "header", "footer", "nav");
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public FilterResult head(Node node, int depth) {
|
||||||
|
if (node instanceof Element el) {
|
||||||
|
String classes = el.attr("class");
|
||||||
|
String id = el.id();
|
||||||
|
|
||||||
|
for (String badClassElement : badClassElements) {
|
||||||
|
if (classes.contains(badClassElement)) {
|
||||||
|
return FilterResult.REMOVE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (String badIdElement : badIdElements) {
|
||||||
|
if (id.contains(badIdElement)) {
|
||||||
|
return FilterResult.REMOVE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return FilterResult.CONTINUE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Extract tag keywords from the blog post
|
||||||
|
public static class BlogTagExtractor implements NodeVisitor {
|
||||||
|
private final Set<String> tags = new HashSet<>();
|
||||||
|
int lookForTags = -1;
|
||||||
|
|
||||||
|
public Set<String> getTags() {
|
||||||
|
Set<String> tagsClean = tags.stream().map(String::toLowerCase).map(this::cleanTag).filter(Strings::isNotBlank).collect(Collectors.toSet());
|
||||||
|
|
||||||
|
// If there are more than 5 tags, it's probably a global tag listing
|
||||||
|
// and not a post-specific tag listing
|
||||||
|
if (tagsClean.size() > 5)
|
||||||
|
return Set.of();
|
||||||
|
|
||||||
|
return tagsClean;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final Pattern splitterPattern = Pattern.compile("\\s+");
|
||||||
|
private final Pattern noisePattern = Pattern.compile("[^a-zA-Z0-9]");
|
||||||
|
|
||||||
|
// This is hideously expensive but blog posts are relatively few and far between
|
||||||
|
private String cleanTag(String tag) {
|
||||||
|
|
||||||
|
String[] parts = splitterPattern.split(tag);
|
||||||
|
|
||||||
|
if (parts.length > 3)
|
||||||
|
return "";
|
||||||
|
|
||||||
|
for (int i = 0; i < parts.length; i++) {
|
||||||
|
if (parts[i].startsWith("#"))
|
||||||
|
parts[i] = parts[i].substring(1);
|
||||||
|
else if (parts[i].startsWith("(") && parts[i].endsWith(")"))
|
||||||
|
parts[i] = "";
|
||||||
|
else
|
||||||
|
parts[i] = noisePattern.matcher(parts[i]).replaceAll("");
|
||||||
|
|
||||||
|
if (parts[i].equals("tags"))
|
||||||
|
parts[i] = "";
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
return Arrays.stream(parts).filter(Strings::isNotBlank).collect(Collectors.joining("_"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void head(Node node, int depth) {
|
||||||
|
|
||||||
|
if (!(node instanceof Element el)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lookForTags < 0) {
|
||||||
|
if (el.attr("class").contains("tags")) {
|
||||||
|
lookForTags = depth;
|
||||||
|
}
|
||||||
|
if (el.tagName().equals("a")) {
|
||||||
|
if (el.attr("class").contains("tag")
|
||||||
|
|| el.attr("href").startsWith("/tag/"))
|
||||||
|
tags.add(el.text());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (el.tagName().equals("a")) {
|
||||||
|
tags.add(el.text());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
public void tail(Node node, int depth) {
|
||||||
|
if (depth <= lookForTags) { lookForTags = -1; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -2,7 +2,9 @@ package nu.marginalia.converting.processor.plugin.specialization;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.converting.processor.ConverterDomainTypes;
|
||||||
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
|
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
|
||||||
|
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
@ -10,27 +12,41 @@ import java.util.Set;
|
|||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class HtmlProcessorSpecializations {
|
public class HtmlProcessorSpecializations {
|
||||||
|
private final ConverterDomainTypes domainTypes;
|
||||||
private final LemmySpecialization lemmySpecialization;
|
private final LemmySpecialization lemmySpecialization;
|
||||||
private final XenForoSpecialization xenforoSpecialization;
|
private final XenForoSpecialization xenforoSpecialization;
|
||||||
private final PhpBBSpecialization phpBBSpecialization;
|
private final PhpBBSpecialization phpBBSpecialization;
|
||||||
private final JavadocSpecialization javadocSpecialization;
|
private final JavadocSpecialization javadocSpecialization;
|
||||||
|
private final BlogSpecialization blogSpecialization;
|
||||||
private final DefaultSpecialization defaultSpecialization;
|
private final DefaultSpecialization defaultSpecialization;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public HtmlProcessorSpecializations(LemmySpecialization lemmySpecialization,
|
public HtmlProcessorSpecializations(ConverterDomainTypes domainTypes,
|
||||||
|
LemmySpecialization lemmySpecialization,
|
||||||
XenForoSpecialization xenforoSpecialization,
|
XenForoSpecialization xenforoSpecialization,
|
||||||
PhpBBSpecialization phpBBSpecialization,
|
PhpBBSpecialization phpBBSpecialization,
|
||||||
JavadocSpecialization javadocSpecialization,
|
JavadocSpecialization javadocSpecialization,
|
||||||
|
BlogSpecialization blogSpecialization,
|
||||||
DefaultSpecialization defaultSpecialization) {
|
DefaultSpecialization defaultSpecialization) {
|
||||||
|
this.domainTypes = domainTypes;
|
||||||
this.lemmySpecialization = lemmySpecialization;
|
this.lemmySpecialization = lemmySpecialization;
|
||||||
this.xenforoSpecialization = xenforoSpecialization;
|
this.xenforoSpecialization = xenforoSpecialization;
|
||||||
this.phpBBSpecialization = phpBBSpecialization;
|
this.phpBBSpecialization = phpBBSpecialization;
|
||||||
this.javadocSpecialization = javadocSpecialization;
|
this.javadocSpecialization = javadocSpecialization;
|
||||||
|
this.blogSpecialization = blogSpecialization;
|
||||||
this.defaultSpecialization = defaultSpecialization;
|
this.defaultSpecialization = defaultSpecialization;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */
|
/** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */
|
||||||
public HtmlProcessorSpecializationIf select(DocumentGeneratorExtractor.DocumentGenerator generator) {
|
public HtmlProcessorSpecializationIf select(
|
||||||
|
DocumentGeneratorExtractor.DocumentGenerator generator,
|
||||||
|
EdgeUrl url)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (domainTypes.isBlog(url.domain)) {
|
||||||
|
return blogSpecialization;
|
||||||
|
}
|
||||||
|
|
||||||
if (generator.keywords().contains("lemmy")) {
|
if (generator.keywords().contains("lemmy")) {
|
||||||
return lemmySpecialization;
|
return lemmySpecialization;
|
||||||
}
|
}
|
||||||
@ -58,5 +74,8 @@ public class HtmlProcessorSpecializations {
|
|||||||
|
|
||||||
default boolean shouldIndex(EdgeUrl url) { return true; }
|
default boolean shouldIndex(EdgeUrl url) { return true; }
|
||||||
default double lengthModifier() { return 1.0; }
|
default double lengthModifier() { return 1.0; }
|
||||||
|
|
||||||
|
default void amendWords(Document doc, DocumentKeywordsBuilder words) {}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4,6 +4,8 @@ import com.google.inject.AbstractModule;
|
|||||||
import com.google.inject.name.Names;
|
import com.google.inject.name.Names;
|
||||||
import nu.marginalia.LanguageModels;
|
import nu.marginalia.LanguageModels;
|
||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.converting.processor.ConverterDomainTypes;
|
||||||
|
import org.mockito.Mockito;
|
||||||
|
|
||||||
public class ConvertingIntegrationTestModule extends AbstractModule {
|
public class ConvertingIntegrationTestModule extends AbstractModule {
|
||||||
public void configure() {
|
public void configure() {
|
||||||
@ -13,5 +15,6 @@ public class ConvertingIntegrationTestModule extends AbstractModule {
|
|||||||
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
|
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
|
||||||
|
|
||||||
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||||
|
bind(ConverterDomainTypes.class).toInstance(Mockito.mock(ConverterDomainTypes.class));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,17 @@
|
|||||||
|
package nu.marginalia.converting.processor.plugin.specialization;
|
||||||
|
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class BlogSpecializationTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void shouldIndex() throws Exception {
|
||||||
|
var spec = new BlogSpecialization(null);
|
||||||
|
assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/2023/00/22/")));
|
||||||
|
assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/2023/00/")));
|
||||||
|
assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/00/22/")));
|
||||||
|
}
|
||||||
|
}
|
@ -34,7 +34,7 @@ class JavadocSpecializationTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
void generatorExtraction() {
|
void generatorExtraction() {
|
||||||
var gen = generatorExtractor.generatorCleaned(Jsoup.parse(thread));
|
var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), "");
|
||||||
|
|
||||||
System.out.println(gen);
|
System.out.println(gen);
|
||||||
}
|
}
|
||||||
|
@ -37,8 +37,8 @@ class LemmySpecializationTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
void generatorExtraction() {
|
void generatorExtraction() {
|
||||||
var generatorIndex = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyIndexHtml));
|
var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), "");
|
||||||
var generatorPost = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyPost));
|
var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), "");
|
||||||
|
|
||||||
System.out.println(generatorIndex);
|
System.out.println(generatorIndex);
|
||||||
System.out.println(generatorPost);
|
System.out.println(generatorPost);
|
||||||
|
@ -34,7 +34,7 @@ class XenForoSpecializationTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
void generatorExtraction() {
|
void generatorExtraction() {
|
||||||
var gen = generatorExtractor.generatorCleaned(Jsoup.parse(thread));
|
var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), "");
|
||||||
|
|
||||||
System.out.println(gen);
|
System.out.println(gen);
|
||||||
}
|
}
|
||||||
|
@ -20,7 +20,8 @@ public class LinkFilterSelector {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (isLemmy(head)) {
|
if (isLemmy(head)) {
|
||||||
return url -> url.path.startsWith("/post/") || url.path.startsWith("/c/");
|
return url -> url.path.startsWith("/post/")
|
||||||
|
|| (url.path.startsWith("/c/") && !url.path.contains("@"));
|
||||||
}
|
}
|
||||||
if (isDiscourse(head)) {
|
if (isDiscourse(head)) {
|
||||||
return url -> url.path.startsWith("/t/") || url.path.contains("/latest");
|
return url -> url.path.startsWith("/t/") || url.path.contains("/latest");
|
||||||
|
@ -211,7 +211,7 @@ public class IndexQueryService {
|
|||||||
return switch (priority) {
|
return switch (priority) {
|
||||||
case BEST -> false;
|
case BEST -> false;
|
||||||
case GOOD -> resultCount > params.fetchSize / 4;
|
case GOOD -> resultCount > params.fetchSize / 4;
|
||||||
case FALLBACK -> resultCount > params.fetchSize / 256;
|
case FALLBACK -> resultCount > params.fetchSize / 8;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -121,7 +121,7 @@ public class UrlDetails {
|
|||||||
|
|
||||||
for (var problem :EnumSet.of(
|
for (var problem :EnumSet.of(
|
||||||
HtmlFeature.JS,
|
HtmlFeature.JS,
|
||||||
HtmlFeature.TRACKING,
|
HtmlFeature.TRACKING_INNOCENT,
|
||||||
HtmlFeature.AFFILIATE_LINK,
|
HtmlFeature.AFFILIATE_LINK,
|
||||||
HtmlFeature.COOKIES,
|
HtmlFeature.COOKIES,
|
||||||
HtmlFeature.ADVERTISEMENT)) {
|
HtmlFeature.ADVERTISEMENT)) {
|
||||||
@ -156,7 +156,7 @@ public class UrlDetails {
|
|||||||
return HtmlFeature.hasFeature(features, HtmlFeature.JS);
|
return HtmlFeature.hasFeature(features, HtmlFeature.JS);
|
||||||
}
|
}
|
||||||
public boolean isTracking() {
|
public boolean isTracking() {
|
||||||
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING);
|
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_INNOCENT);
|
||||||
}
|
}
|
||||||
public boolean isAffiliate() {
|
public boolean isAffiliate() {
|
||||||
return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK);
|
return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK);
|
||||||
|
@ -1,18 +1,12 @@
|
|||||||
package nu.marginalia.tools.experiments;
|
package nu.marginalia.tools.experiments;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import nu.marginalia.converting.model.GeneratorType;
|
|
||||||
import nu.marginalia.converting.model.ProcessedDocument;
|
|
||||||
import nu.marginalia.converting.processor.DomainProcessor;
|
import nu.marginalia.converting.processor.DomainProcessor;
|
||||||
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
|
import nu.marginalia.converting.processor.plugin.specialization.BlogSpecialization;
|
||||||
import nu.marginalia.crawling.model.CrawledDomain;
|
import nu.marginalia.crawling.model.CrawledDomain;
|
||||||
import nu.marginalia.tools.Experiment;
|
import nu.marginalia.tools.Experiment;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
public class DebugConverterExperiment extends Experiment {
|
public class DebugConverterExperiment extends Experiment {
|
||||||
|
|
||||||
|
|
||||||
@ -24,56 +18,25 @@ public class DebugConverterExperiment extends Experiment {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Set<String> seenGenerators = new HashSet<>();
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean process(CrawledDomain domain) {
|
public boolean process(CrawledDomain domain) {
|
||||||
|
|
||||||
if (domain.doc == null) return true;
|
if (domain.doc == null) return true;
|
||||||
|
|
||||||
var dge = new DocumentGeneratorExtractor();
|
|
||||||
|
|
||||||
for (var doc : domain.doc) {
|
for (var doc : domain.doc) {
|
||||||
if (doc.documentBody == null) continue;
|
if (doc.documentBody == null) continue;
|
||||||
|
|
||||||
var parsed = Jsoup.parse(doc.documentBody.decode());
|
var parsed = Jsoup.parse(doc.documentBody.decode());
|
||||||
parsed.getElementsByTag("head").comments()
|
|
||||||
.stream().filter(c -> {
|
|
||||||
String data = c.getData();
|
|
||||||
if (data.contains("<script"))
|
|
||||||
return false;
|
|
||||||
if (data.contains("[if"))
|
|
||||||
return false;
|
|
||||||
if (data.contains("shim"))
|
|
||||||
return false;
|
|
||||||
return data.contains("Generated by") || data.contains("generated by")
|
|
||||||
|| data.contains("Powered by") || data.contains("powered by");
|
|
||||||
}).forEach(System.out::println);
|
|
||||||
|
|
||||||
var generators = dge.generatorCleaned(parsed);
|
var tagExtractor = new BlogSpecialization.BlogTagExtractor();
|
||||||
for (var g : generators.keywords()) {
|
parsed.traverse(tagExtractor);
|
||||||
if (seenGenerators.add(g)) {
|
var tags = tagExtractor.getTags();
|
||||||
System.out.println(g + "->" + generators.type());
|
if (!tags.isEmpty()) {
|
||||||
if (generators.type() == GeneratorType.UNKNOWN) {
|
System.out.println(tags);
|
||||||
System.out.println(parsed.select("meta[name=generator]")
|
|
||||||
.attr("content"));
|
|
||||||
System.out.println(doc.url);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// var ret = domainProcessor.process(domain);
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// ret.documents.stream()
|
|
||||||
// .filter(ProcessedDocument::isProcessedFully)
|
|
||||||
// .peek(d -> System.out.println(d.url))
|
|
||||||
// .map(d -> d.details.metadata)
|
|
||||||
// .forEach(System.out::println);
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user