Additional code restructuring to get rid of util and misc-style packages.

Viktor Lofgren 2023-03-11 13:48:40 +01:00
parent 73e412ea5b
commit 6d939175b1
147 changed files with 715 additions and 600 deletions

View File

@@ -1,8 +1,8 @@
 package nu.marginalia.index.client.model.results;
-import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.WordMetadata;
-import nu.marginalia.model.crawl.EdgePageDocumentFlags;
+import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import java.util.Objects;
@@ -26,7 +26,7 @@ public final class SearchResultKeywordScore {
         this.hasPriorityTerms = hasPriorityTerms;
     }
-    private boolean hasTermFlag(EdgePageWordFlags flag) {
+    private boolean hasTermFlag(WordFlags flag) {
         return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
     }
@@ -37,7 +37,7 @@ public final class SearchResultKeywordScore {
         sum += DocumentMetadata.decodeTopology(encodedDocMetadata);
-        if (DocumentMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) {
+        if (DocumentMetadata.hasFlags(encodedDocMetadata, DocumentFlags.Simple.asBit())) {
             sum += 20;
         }
@@ -53,28 +53,28 @@ public final class SearchResultKeywordScore {
     public double termValue() {
         double sum = 0;
-        if (hasTermFlag(EdgePageWordFlags.Title)) {
+        if (hasTermFlag(WordFlags.Title)) {
             sum -= 15;
         }
-        if (hasTermFlag(EdgePageWordFlags.Site)) {
+        if (hasTermFlag(WordFlags.Site)) {
             sum -= 10;
-        } else if (hasTermFlag(EdgePageWordFlags.SiteAdjacent)) {
+        } else if (hasTermFlag(WordFlags.SiteAdjacent)) {
             sum -= 5;
         }
-        if (hasTermFlag(EdgePageWordFlags.Subjects)) {
+        if (hasTermFlag(WordFlags.Subjects)) {
             sum -= 10;
         }
-        if (hasTermFlag(EdgePageWordFlags.NamesWords)) {
+        if (hasTermFlag(WordFlags.NamesWords)) {
             sum -= 1;
         }
-        if (hasTermFlag(EdgePageWordFlags.UrlDomain)) {
+        if (hasTermFlag(WordFlags.UrlDomain)) {
             sum -= 5;
         }
-        if (hasTermFlag(EdgePageWordFlags.UrlPath)) {
+        if (hasTermFlag(WordFlags.UrlPath)) {
             sum -= 5;
         }
@@ -95,12 +95,12 @@ public final class SearchResultKeywordScore {
     }
     public boolean isKeywordSpecial() {
-        return keyword.contains(":") || hasTermFlag(EdgePageWordFlags.Synthetic);
+        return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic);
     }
     public boolean isKeywordRegular() {
         return !keyword.contains(":")
-                && !hasTermFlag(EdgePageWordFlags.Synthetic);
+                && !hasTermFlag(WordFlags.Synthetic);
     }
     public long encodedWordMetadata() {
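The renamed flags drive the same scoring as before: each flag that holds subtracts a constant, so a more negative termValue() means a stronger keyword signal. As a rough illustration (a sketch, not code from this commit; it assumes WordMetadata packs its flags in the low bits, as its encodeFlags() suggests):

    // Sketch: a word flagged Title and Subjects contributes -15 + -10 = -25 to termValue().
    long encodedWordMetadata = new WordMetadata(32, 0x1,
            EnumSet.of(WordFlags.Title, WordFlags.Subjects)).encode();
    // hasTermFlag(WordFlags.Title) and hasTermFlag(WordFlags.Subjects) both hold here,
    // so the Title and Subjects branches above each fire once.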

View File

@@ -14,7 +14,6 @@ java {
 dependencies {
     implementation project(':code:common:service-discovery')
     implementation project(':code:common:service-client')
-    implementation project(':code:libraries:misc')
 }
 test {

View File

@@ -14,6 +14,7 @@ dependencies {
     implementation project(':code:common:service-discovery')
     implementation project(':code:common:service-client')
     implementation project(':code:libraries:big-string')
+    implementation project(':code:libraries:braille-block-punch-cards')
     implementation libs.lombok
     annotationProcessor libs.lombok

View File

@@ -1,6 +1,6 @@
 package nu.marginalia.model.crawl;
-public enum EdgeDomainIndexingState {
+public enum DomainIndexingState {
     ACTIVE("Active"),
     EXHAUSTED("Fully Crawled"),
     SPECIAL("Content is side-loaded"),
@@ -12,7 +12,7 @@ public enum EdgeDomainIndexingState {
     public String desc;
-    EdgeDomainIndexingState(String desc) {
+    DomainIndexingState(String desc) {
         this.desc = desc;
     }
 }

View File

@@ -1,15 +0,0 @@
-package nu.marginalia.model.crawl;
-import lombok.*;
-@AllArgsConstructor
-@EqualsAndHashCode
-@Getter
-@Setter
-@Builder
-@ToString
-public class EdgeContentType {
-    public final String contentType;
-    public final String charset;
-}

View File

@@ -1,7 +1,7 @@
 package nu.marginalia.model.crawl;
 /** This should correspond to EC_URL.STATE */
-public enum EdgeUrlState {
+public enum UrlIndexingState {
     OK,
     REDIRECT,
     DEAD,

View File

@@ -5,8 +5,8 @@ import gnu.trove.set.hash.TIntHashSet;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.id.EdgeId;
-@ImplementedBy(EdgeDomainBlacklistImpl.class)
-public interface EdgeDomainBlacklist {
+@ImplementedBy(DomainBlacklistImpl.class)
+public interface DomainBlacklist {
     boolean isBlacklisted(int domainId);
     default boolean isBlacklisted(EdgeId<EdgeDomain> domainId) {
         return isBlacklisted(domainId.id());
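Because the interface carries an @ImplementedBy default binding, Guice consumers can inject DomainBlacklist directly without a module entry. A hypothetical consumer, for illustration only:

    // Hypothetical class (not part of this commit): Guice resolves DomainBlacklist
    // to DomainBlacklistImpl through the @ImplementedBy default binding above.
    public class LinkFilter {
        private final DomainBlacklist blacklist;

        @Inject
        public LinkFilter(DomainBlacklist blacklist) {
            this.blacklist = blacklist;
        }

        public boolean isAllowed(int domainId) {
            return !blacklist.isBlacklisted(domainId);
        }
    }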

View File

@@ -6,20 +6,19 @@ import com.zaxxer.hikari.HikariDataSource;
 import gnu.trove.set.hash.TIntHashSet;
 import io.reactivex.rxjava3.schedulers.Schedulers;
 import lombok.SneakyThrows;
-import nu.marginalia.model.EdgeDomain;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.util.concurrent.TimeUnit;
 @Singleton
-public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
+public class DomainBlacklistImpl implements DomainBlacklist {
     private volatile TIntHashSet spamDomainSet = new TIntHashSet();
     private final HikariDataSource dataSource;
     private final Logger logger = LoggerFactory.getLogger(getClass());
     @Inject
-    public EdgeDomainBlacklistImpl(HikariDataSource dataSource) {
+    public DomainBlacklistImpl(HikariDataSource dataSource) {
         this.dataSource = dataSource;
         Schedulers.io().schedulePeriodicallyDirect(this::updateSpamList, 5, 600, TimeUnit.SECONDS);

View File

@@ -1,8 +1,8 @@
-package nu.marginalia.model.crawl;
+package nu.marginalia.model.idx;
 import java.util.EnumSet;
-public enum EdgePageDocumentFlags {
+public enum DocumentFlags {
     /** Simple processing was done, this document should be de-prioritized as a search result */
     Simple,
@@ -23,10 +23,10 @@ public enum EdgePageDocumentFlags {
         return (asBit() & value) > 0;
     }
-    public static EnumSet<EdgePageDocumentFlags> decode(long encodedValue) {
-        EnumSet<EdgePageDocumentFlags> ret = EnumSet.noneOf(EdgePageDocumentFlags.class);
-        for (EdgePageDocumentFlags f : values()) {
+    public static EnumSet<DocumentFlags> decode(long encodedValue) {
+        EnumSet<DocumentFlags> ret = EnumSet.noneOf(DocumentFlags.class);
+        for (DocumentFlags f : values()) {
             if ((encodedValue & f.asBit()) > 0) {
                 ret.add(f);
             }

View File

@@ -1,6 +1,5 @@
 package nu.marginalia.model.idx;
-import nu.marginalia.model.crawl.EdgePageDocumentFlags;
 import nu.marginalia.model.crawl.PubDate;
 import java.util.EnumSet;
@@ -44,7 +43,7 @@ public record DocumentMetadata(int rank,
     public DocumentMetadata() {
         this(defaultValue());
     }
-    public DocumentMetadata(int topology, int year, int sets, int quality, EnumSet<EdgePageDocumentFlags> flags) {
+    public DocumentMetadata(int topology, int year, int sets, int quality, EnumSet<DocumentFlags> flags) {
         this(0, 0, topology, year, sets, quality, encodeFlags(flags));
     }
@@ -58,13 +57,13 @@ public record DocumentMetadata(int rank,
         return new DocumentMetadata(rank, encSize, topology, year, sets, quality, flags);
     }
-    private static byte encodeFlags(Set<EdgePageDocumentFlags> flags) {
+    private static byte encodeFlags(Set<DocumentFlags> flags) {
         byte ret = 0;
         for (var flag : flags) { ret |= flag.asBit(); }
         return ret;
     }
-    public boolean hasFlag(EdgePageDocumentFlags flag) {
+    public boolean hasFlag(DocumentFlags flag) {
         return (flags & flag.asBit()) != 0;
     }

View File

@@ -1,23 +1,20 @@
-package nu.marginalia.model.crawl;
+package nu.marginalia.model.idx;
 import java.util.EnumSet;
-public enum EdgePageWordFlags {
+public enum WordFlags {
     /** Word appears in title */
     Title,
-    /** Word appears to be the subject in several sentences
-     *  @see SubjectCounter */
+    /** Word appears to be the subject in several sentences */
     Subjects,
-    /** Word has high tf-idf
-     *  @see KeywordCounter */
+    /** Word has high tf-idf */
     TfIdfHigh,
-    /** Word is a likely named object. This is a weaker version of Subjects.
-     *  @see NameCounter */
+    /** Word is a likely named object. This is a weaker version of Subjects. */
     NamesWords,
     /** The word isn't actually a word on page, but a fake keyword from the code
@@ -26,12 +23,10 @@ public enum EdgePageWordFlags {
     Synthetic,
     /** Word is important to site
-     * @see SiteWords
      */
     Site,
     /** Word is important to adjacent documents
-     * @see SiteWords
      * */
     SiteAdjacent,
@@ -54,10 +49,10 @@ public enum EdgePageWordFlags {
         return (asBit() & value) > 0;
     }
-    public static EnumSet<EdgePageWordFlags> decode(long encodedValue) {
-        EnumSet<EdgePageWordFlags> ret = EnumSet.noneOf(EdgePageWordFlags.class);
-        for (EdgePageWordFlags f : values()) {
+    public static EnumSet<WordFlags> decode(long encodedValue) {
+        EnumSet<WordFlags> ret = EnumSet.noneOf(WordFlags.class);
+        for (WordFlags f : values()) {
             if ((encodedValue & f.asBit()) > 0) {
                 ret.add(f);
             }
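The decode() loop implies each flag owns a distinct bit (asBit() is presumably one bit per constant, e.g. 1 << ordinal()), which makes encoding and decoding a lossless round trip. A minimal sketch under that assumption, not code from this commit:

    // Pack a flag set into a bitmask and recover it; assumes asBit() yields
    // one distinct bit per constant, as the decode() loop implies.
    EnumSet<WordFlags> flags = EnumSet.of(WordFlags.Title, WordFlags.Synthetic);
    long encoded = 0;
    for (WordFlags f : flags) {
        encoded |= f.asBit();
    }
    assert WordFlags.decode(encoded).equals(flags);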

View File

@@ -1,7 +1,7 @@
 package nu.marginalia.model.idx;
-import nu.marginalia.model.crawl.EdgePageWordFlags;
-import nu.marginalia.util.BrailleBlockPunchCards;
+import nu.marginalia.bbpc.BrailleBlockPunchCards;
 import java.util.EnumSet;
 import java.util.Set;
@@ -39,12 +39,12 @@ public record WordMetadata(int tfIdf,
     public WordMetadata(int tfIdf,
                         int positions,
-                        Set<EdgePageWordFlags> flags)
+                        Set<WordFlags> flags)
     {
         this(tfIdf, positions, encodeFlags(flags));
     }
-    private static byte encodeFlags(Set<EdgePageWordFlags> flags) {
+    private static byte encodeFlags(Set<WordFlags> flags) {
         byte ret = 0;
         for (var flag : flags) { ret |= flag.asBit(); }
         return ret;
@@ -64,7 +64,7 @@ public record WordMetadata(int tfIdf,
         return (meta >>> TF_IDF_SHIFT) & TF_IDF_MASK;
     }
-    public boolean hasFlag(EdgePageWordFlags flag) {
+    public boolean hasFlag(WordFlags flag) {
         return (flags & flag.asBit()) != 0;
     }
@@ -98,7 +98,7 @@ public record WordMetadata(int tfIdf,
     }
-    public EnumSet<EdgePageWordFlags> flagSet() {
-        return EdgePageWordFlags.decode(flags);
+    public EnumSet<WordFlags> flagSet() {
+        return WordFlags.decode(flags);
     }
 }

View File

@@ -11,7 +11,6 @@ import java.util.regex.Pattern;
 public class QueryParams {
-    private static final Pattern paramSplitterPattern = Pattern.compile("&");
     @Nullable
     public static String queryParamsSanitizer(String path, @Nullable String queryParams) {

View File

@@ -1,7 +1,7 @@
-package nu.marginalia.index.model;
+package nu.marginalia.model;
-import nu.marginalia.model.crawl.EdgePageDocumentFlags;
+import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import org.junit.jupiter.api.Test;
@@ -67,7 +67,7 @@ class DocumentMetadataTest {
     @Test
     public void encRank() {
-        var meta = new DocumentMetadata(5, 22, 3, 8, EnumSet.noneOf(EdgePageDocumentFlags.class))
+        var meta = new DocumentMetadata(5, 22, 3, 8, EnumSet.noneOf(DocumentFlags.class))
                 .withSize(0xffffffff).encode();
         var enc2 = DocumentMetadata.encodeRank(meta, 83);

View File

@@ -1,6 +1,6 @@
 package nu.marginalia.model;
-import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.WordMetadata;
 import org.junit.jupiter.api.Test;
@@ -12,16 +12,16 @@ class WordMetadataTest {
     @Test
     public void codecTest() {
-        verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, EnumSet.noneOf(EdgePageWordFlags.class)));
-        System.out.println(new WordMetadata(32, 0x7f0f0005, EnumSet.allOf(EdgePageWordFlags.class)));
-        System.out.println(new WordMetadata(32, 0xff0f0013, EnumSet.noneOf(EdgePageWordFlags.class)));
+        verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, EnumSet.allOf(WordFlags.class)));
+        verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, EnumSet.allOf(WordFlags.class)));
+        verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, EnumSet.noneOf(WordFlags.class)));
+        System.out.println(new WordMetadata(32, 0x7f0f0005, EnumSet.allOf(WordFlags.class)));
+        System.out.println(new WordMetadata(32, 0xff0f0013, EnumSet.noneOf(WordFlags.class)));
     }
     @Test
     public void testClampTfIdfLow() {
-        var original = new WordMetadata(0x8000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new WordMetadata(0x8000FFFF, 0, EnumSet.noneOf(WordFlags.class));
         var encoded = new WordMetadata(original.encode());
         assertEquals(original.positions(), encoded.positions());
@@ -30,7 +30,7 @@ class WordMetadataTest {
     @Test
     public void testClampTfIdfHigh() {
-        var original = new WordMetadata(0x7000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new WordMetadata(0x7000FFFF, 0, EnumSet.noneOf(WordFlags.class));
         var encoded = new WordMetadata(original.encode());
         assertEquals(original.positions(), encoded.positions());

View File

@@ -12,7 +12,6 @@ java {
 dependencies {
     implementation project(':code:common:service-client')
     implementation project(':code:common:service-discovery')
-    implementation project(':code:libraries:misc')
     implementation libs.lombok
     annotationProcessor libs.lombok

View File

@@ -1,7 +1,7 @@
-package nu.marginalia.model.crawl;
+package nu.marginalia.crawling.common.model;
-public enum EdgeHtmlStandard {
+public enum HtmlStandard {
     PLAIN(0, 1, 1993),
     UNKNOWN(0, 1, 2000),
     HTML123(0, 1, 1997),
@@ -18,7 +18,7 @@ public enum EdgeHtmlStandard {
      * */
     public final int yearGuess;
-    EdgeHtmlStandard(double offset, double scale, int yearGuess) {
+    HtmlStandard(double offset, double scale, int yearGuess) {
         this.offset = offset;
         this.scale = scale;
         this.yearGuess = yearGuess;

View File

@@ -18,6 +18,8 @@ dependencies {
     implementation project(':code:common:service-client')
     implementation project(':code:libraries:language-processing')
+    implementation project(':code:crawl:common')
     implementation libs.lombok
     annotationProcessor libs.lombok
     implementation libs.bundles.slf4j

View File

@@ -2,7 +2,7 @@ package nu.marginalia.converting.instruction;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.converting.model.DocumentKeywords;
 import nu.marginalia.converting.instruction.instructions.DomainLink;
@@ -15,7 +15,7 @@ public interface Interpreter {
     void loadRssFeed(EdgeUrl[] rssFeed);
     void loadDomainLink(DomainLink[] links);
-    void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip);
+    void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip);
     void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument);
     void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError);

View File

@@ -1,7 +1,7 @@
 package nu.marginalia.converting.instruction.instructions;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
-import nu.marginalia.model.crawl.EdgeUrlState;
+import nu.marginalia.crawling.common.model.HtmlStandard;
+import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.converting.instruction.Instruction;
 import nu.marginalia.converting.instruction.InstructionTag;
 import nu.marginalia.converting.instruction.Interpreter;
@@ -10,11 +10,11 @@ import org.jetbrains.annotations.Nullable;
 public record LoadProcessedDocument(EdgeUrl url,
-                                    EdgeUrlState state,
+                                    UrlIndexingState state,
                                     String title,
                                     String description,
                                     int htmlFeatures,
-                                    EdgeHtmlStandard standard,
+                                    HtmlStandard standard,
                                     int length,
                                     long hash,
                                     double quality,

View File

@@ -1,6 +1,6 @@
 package nu.marginalia.converting.instruction.instructions;
-import nu.marginalia.model.crawl.EdgeUrlState;
+import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.converting.instruction.Instruction;
 import nu.marginalia.converting.instruction.InstructionTag;
 import nu.marginalia.converting.instruction.Interpreter;
@@ -8,7 +8,7 @@ import nu.marginalia.model.EdgeUrl;
 public record LoadProcessedDocumentWithError(EdgeUrl url,
-                                             EdgeUrlState state,
+                                             UrlIndexingState state,
                                              String reason) implements Instruction
 {
     @Override

View File

@@ -1,12 +1,12 @@
 package nu.marginalia.converting.instruction.instructions;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.converting.instruction.Instruction;
 import nu.marginalia.converting.instruction.InstructionTag;
 import nu.marginalia.converting.instruction.Interpreter;
 import nu.marginalia.model.EdgeDomain;
-public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) implements Instruction {
+public record LoadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) implements Instruction {
     @Override
     public void apply(Interpreter interpreter) {

View File

@@ -3,7 +3,7 @@ package nu.marginalia.converting.model;
 import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
 import lombok.Getter;
 import lombok.ToString;
-import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordFlags;
 import java.util.*;
@@ -54,14 +54,14 @@ public class DocumentKeywordsBuilder {
         words.putIfAbsent(word, 0);
     }
-    public void setFlagOnMetadataForWords(EdgePageWordFlags flag, Set<String> flagWords) {
+    public void setFlagOnMetadataForWords(WordFlags flag, Set<String> flagWords) {
         flagWords.forEach(word ->
                 words.mergeLong(word, flag.asBit(), (a, b) -> a|b)
         );
     }
     public void addAllSyntheticTerms(Collection<String> newWords) {
-        long meta = EdgePageWordFlags.Synthetic.asBit();
+        long meta = WordFlags.Synthetic.asBit();
         newWords.forEach(word -> {
             words.putIfAbsent(word, meta);
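Note that mergeLong ORs the flag bit into whatever metadata is already stored for the word, so successive calls accumulate flags rather than overwrite them. A small sketch (the builder setup is assumed, not taken from this commit):

    // Assumed setup; illustrates only the bitwise accumulation in setFlagOnMetadataForWords.
    DocumentKeywordsBuilder words = new DocumentKeywordsBuilder();
    words.setFlagOnMetadataForWords(WordFlags.Title, Set.of("marginalia"));
    words.setFlagOnMetadataForWords(WordFlags.Site,  Set.of("marginalia"));
    // "marginalia" now carries Title | Site in its packed metadata word.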

View File

@@ -20,20 +20,27 @@ tasks.distZip.enabled = false
 dependencies {
     implementation project(':third-party')
-    implementation project(':code:api:index-api')
     implementation project(':code:common:model')
     implementation project(':code:common:service')
     implementation project(':code:common:config')
-    implementation project(':code:common:service-discovery')
-    implementation project(':code:common:service-client')
     implementation project(':code:libraries:guarded-regex')
     implementation project(':code:libraries:easy-lsh')
     implementation project(':code:libraries:big-string')
+    implementation project(':code:api:index-api')
+    implementation project(':code:common:service-discovery')
+    implementation project(':code:common:service-client')
     implementation project(':code:libraries:language-processing')
     implementation project(':code:crawl:common')
     implementation project(':code:crawl:converting-model')
     implementation project(':code:crawl:crawling-model')
+    implementation project(':code:features:adblock')
+    implementation project(':code:features:pubdate')
+    implementation project(':code:features:topic-detection')
     implementation libs.lombok
     annotationProcessor libs.lombok
     implementation libs.bundles.slf4j

View File

@@ -1,7 +1,7 @@
 package nu.marginalia.converting;
 import com.github.luben.zstd.ZstdOutputStream;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.converting.instruction.Interpreter;
 import nu.marginalia.converting.model.DocumentKeywords;
@@ -49,7 +49,7 @@ public class ConversionLog implements AutoCloseable, Interpreter {
     public void loadDomainLink(DomainLink[] links) {}
     @Override
-    public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {}
+    public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) {}
     @Override
     public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {}

View File

@@ -2,7 +2,7 @@ package nu.marginalia.converting;
 import com.github.luben.zstd.ZstdOutputStream;
 import com.google.gson.Gson;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.converting.instruction.Instruction;
 import nu.marginalia.converting.instruction.Interpreter;
@@ -106,7 +106,7 @@ public class InstructionWriter {
     public void loadDomainLink(DomainLink[] links) {}
     @Override
-    public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
+    public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) {
         this.domainName = domain.toString();
     }

View File

@@ -1,8 +1,8 @@
 package nu.marginalia.converting.model;
 import lombok.ToString;
-import nu.marginalia.model.crawl.EdgePageDocumentFlags;
-import nu.marginalia.model.crawl.EdgeUrlState;
+import nu.marginalia.model.idx.DocumentFlags;
+import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.model.EdgeUrl;
 import java.util.OptionalDouble;
@@ -14,11 +14,11 @@ public class ProcessedDocument {
     public ProcessedDocumentDetails details;
     public DocumentKeywordsBuilder words;
-    public EdgeUrlState state;
+    public UrlIndexingState state;
     public String stateReason;
     public boolean isOk() {
-        return EdgeUrlState.OK == state;
+        return UrlIndexingState.OK == state;
     }
     public boolean isProcessedFully() {
@@ -28,7 +28,7 @@ public class ProcessedDocument {
         if (details == null)
             return false;
-        return !details.metadata.hasFlag(EdgePageDocumentFlags.Simple);
+        return !details.metadata.hasFlag(DocumentFlags.Simple);
     }
     public OptionalDouble quality() {

View File

@@ -1,7 +1,7 @@
 package nu.marginalia.converting.model;
 import lombok.ToString;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.EdgeUrl;
@@ -23,7 +23,7 @@ public class ProcessedDocumentDetails {
     public long hashCode;
     public Set<HtmlFeature> features;
-    public EdgeHtmlStandard standard;
+    public HtmlStandard standard;
     public List<EdgeUrl> linksInternal;
     public List<EdgeUrl> linksExternal;

View File

@@ -2,7 +2,7 @@ package nu.marginalia.converting.model;
 import lombok.ToString;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import java.util.List;
 import java.util.Optional;
@@ -13,7 +13,7 @@ public class ProcessedDomain {
     public EdgeDomain domain;
     public List<ProcessedDocument> documents;
-    public EdgeDomainIndexingState state;
+    public DomainIndexingState state;
     public EdgeDomain redirect;
     public String ip;

View File

@@ -4,7 +4,7 @@ import com.google.inject.Inject;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;
-import nu.marginalia.model.crawl.EdgeUrlState;
+import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin;
@@ -45,12 +45,12 @@ public class DocumentProcessor {
             processDocument(crawledDocument, crawledDomain, ret);
         }
         catch (DisqualifiedException ex) {
-            ret.state = EdgeUrlState.DISQUALIFIED;
+            ret.state = UrlIndexingState.DISQUALIFIED;
             ret.stateReason = ex.reason.toString();
             logger.debug("Disqualified {}: {}", ret.url, ex.reason);
         }
         catch (Exception ex) {
-            ret.state = EdgeUrlState.DISQUALIFIED;
+            ret.state = UrlIndexingState.DISQUALIFIED;
             ret.stateReason = DisqualifiedException.DisqualificationReason.PROCESSING_EXCEPTION.toString();
             logger.info("Failed to convert " + crawledDocument.url, ex);
             ex.printStackTrace();
@@ -125,11 +125,11 @@ public class DocumentProcessor {
         return false;
     }
-    private EdgeUrlState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) {
+    private UrlIndexingState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) {
         return switch (CrawlerDocumentStatus.valueOf(crawlerStatus)) {
-            case OK -> httpStatus < 300 ? EdgeUrlState.OK : EdgeUrlState.DEAD;
-            case REDIRECT -> EdgeUrlState.REDIRECT;
-            default -> EdgeUrlState.DEAD;
+            case OK -> httpStatus < 300 ? UrlIndexingState.OK : UrlIndexingState.DEAD;
+            case REDIRECT -> UrlIndexingState.REDIRECT;
+            default -> UrlIndexingState.DEAD;
         };
     }

View File

@@ -6,11 +6,10 @@ import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;
 import nu.marginalia.crawling.model.CrawlerDomainStatus;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.util.StringPool;
 import nu.marginalia.converting.processor.logic.links.InternalLinkGraph;
 import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
@@ -134,12 +133,12 @@ public class DomainProcessor {
         }
     }
-    private EdgeDomainIndexingState getState(String crawlerStatus) {
+    private DomainIndexingState getState(String crawlerStatus) {
         return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) {
-            case OK -> EdgeDomainIndexingState.ACTIVE;
-            case REDIRECT -> EdgeDomainIndexingState.REDIR;
-            case BLOCKED -> EdgeDomainIndexingState.BLOCKED;
-            default -> EdgeDomainIndexingState.ERROR;
+            case OK -> DomainIndexingState.ACTIVE;
+            case REDIRECT -> DomainIndexingState.REDIR;
+            case BLOCKED -> DomainIndexingState.BLOCKED;
+            default -> DomainIndexingState.ERROR;
         };
     }

View File

@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor;
 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
-import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.model.EdgeUrl;
@@ -24,7 +24,7 @@ public class SiteWords {
         Map<EdgeUrl, Set<String>> linkedKeywords = getAdjacentWords(internalLinkGraph);
         for (var doc : processedDomain.documents) {
-            applyKeywordsToDoc(doc, EdgePageWordFlags.SiteAdjacent, linkedKeywords.get(doc.url));
+            applyKeywordsToDoc(doc, WordFlags.SiteAdjacent, linkedKeywords.get(doc.url));
         }
     }
@@ -33,17 +33,17 @@ public class SiteWords {
         Set<String> commonSiteWords = new HashSet<>(10);
         commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain,
-                EdgePageWordFlags.Subjects));
+                WordFlags.Subjects));
         commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain,
-                EdgePageWordFlags.Title));
+                WordFlags.Title));
         if (commonSiteWords.isEmpty()) {
             return;
         }
         for (var doc : processedDomain.documents) {
-            applyKeywordsToDoc(doc, EdgePageWordFlags.Site, commonSiteWords);
+            applyKeywordsToDoc(doc, WordFlags.Site, commonSiteWords);
         }
     }
@@ -74,7 +74,7 @@ public class SiteWords {
         return linkedKeywords;
     }
-    private void applyKeywordsToDoc(ProcessedDocument doc, EdgePageWordFlags flag, Set<String> words) {
+    private void applyKeywordsToDoc(ProcessedDocument doc, WordFlags flag, Set<String> words) {
         if (doc.words != null && words != null) {
            doc.words.setFlagOnMetadataForWords(flag, words);
        }

View File

@@ -7,7 +7,7 @@ import nu.marginalia.language.keywords.KeywordExtractor;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.KeywordMetadata;
 import nu.marginalia.language.model.WordRep;
-import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordFlags;
 import java.util.EnumSet;
@@ -22,7 +22,7 @@ public class SimpleKeywords {
                               KeywordMetadata metadata,
                               DocumentLanguageData documentLanguageData) {
-        EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);
+        EnumSet<WordFlags> flagsTemplate = EnumSet.noneOf(WordFlags.class);
         for (var sent : documentLanguageData.sentences) {

View File

@@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.logic;
 import crawlercommons.utils.Strings;
 import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.converting.model.DisqualifiedException;
 import org.jsoup.nodes.Document;
@@ -22,7 +22,7 @@ public class DocumentValuator {
     );
-    public double getQuality(CrawledDocument crawledDocument, EdgeHtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException {
+    public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException {
         double smutCoefficient = dld.streamLowerCase().filter(filthTable::contains).count();
         double scriptPenalty = getScriptPenalty(parsedDocument);

View File

@@ -2,10 +2,14 @@ package nu.marginalia.converting.processor.logic;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import nu.marginalia.adblock.AdblockSimulator;
+import nu.marginalia.adblock.GoogleAnwersSpamDetector;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.converting.processor.logic.topic.*;
+import nu.marginalia.topic.RecipeDetector;
+import nu.marginalia.topic.TextileCraftDetector;
+import nu.marginalia.topic.WoodworkingDetector;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;

View File

@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.logic;
 import com.google.common.base.Strings;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.DocumentType;
 import org.slf4j.Logger;
@@ -12,53 +12,53 @@ public class HtmlStandardExtractor {
     private static final Logger logger = LoggerFactory.getLogger(HtmlStandardExtractor.class);
-    public static EdgeHtmlStandard parseDocType(DocumentType docType) {
+    public static HtmlStandard parseDocType(DocumentType docType) {
         if (null == docType) {
-            return EdgeHtmlStandard.UNKNOWN;
+            return HtmlStandard.UNKNOWN;
         }
         String publicId = docType.publicId();
         if (Strings.isNullOrEmpty(publicId))
-            return EdgeHtmlStandard.HTML5;
+            return HtmlStandard.HTML5;
         publicId = publicId.toUpperCase();
         if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 4")) {
-            return EdgeHtmlStandard.HTML4;
+            return HtmlStandard.HTML4;
         }
         if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 3")) {
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         }
         if (publicId.startsWith("-//INTERNET/RFC XXXX//EN"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//NETSCAPE COMM. CORP"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//SQ//DTD HTML 2"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//SOFTQUAD//DTD HTML 2"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//W3O//DTD W3 HTML 2"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML 2"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML//EN"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-/W3C//DTD HTML 3"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-/W3C/DTD HTML 3"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML 3"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//W3C//DTD XHTML"))
-            return EdgeHtmlStandard.XHTML;
+            return HtmlStandard.XHTML;
         if (publicId.startsWith("ISO/IEC 15445:2000//DTD"))
-            return EdgeHtmlStandard.XHTML;
+            return HtmlStandard.XHTML;
         if (publicId.startsWith("-//W3C//DTD HTML"))
-            return EdgeHtmlStandard.HTML4;
+            return HtmlStandard.HTML4;
         logger.debug("Unknown publicID standard {}", publicId);
-        return EdgeHtmlStandard.UNKNOWN;
+        return HtmlStandard.UNKNOWN;
     }
-    public static EdgeHtmlStandard sniffHtmlStandard(Document parsed) {
+    public static HtmlStandard sniffHtmlStandard(Document parsed) {
         int html4Attributes = 0;
         int html5Attributes = 0;
@@ -72,11 +72,11 @@ public class HtmlStandardExtractor {
             html4Attributes++;
         }
         if (html5Attributes > 0) {
-            return EdgeHtmlStandard.HTML5;
+            return HtmlStandard.HTML5;
         }
         if (html4Attributes > 0) {
-            return EdgeHtmlStandard.HTML4;
+            return HtmlStandard.HTML4;
         }
-        return EdgeHtmlStandard.HTML123;
+        return HtmlStandard.HTML123;
     }
 }
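To see the renamed matcher in action: constructing a jsoup DocumentType by hand and passing it through parseDocType exercises the publicId prefix rules above. A usage sketch, not taken from the commit:

    // An XHTML 1.0 DOCTYPE; the "-//W3C//DTD XHTML" prefix rule classifies it.
    DocumentType docType = new DocumentType("html",
            "-//W3C//DTD XHTML 1.0 Strict//EN",
            "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
    HtmlStandard standard = HtmlStandardExtractor.parseDocType(docType);
    // standard == HtmlStandard.XHTML; a null docType would yield UNKNOWN,
    // and an empty publicId is treated as HTML5.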

View File

@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.logic;
 import com.google.inject.Singleton;
-import nu.marginalia.model.crawl.EdgeUrlState;
+import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.lsh.EasyLSH;
 import org.slf4j.Logger;
@@ -52,7 +52,7 @@ public class LshDocumentDeduplicator {
     {
         logger.debug("{} duplicates {}", otherDoc.url, thisDoc.url);
-        otherDoc.state = EdgeUrlState.DISQUALIFIED;
+        otherDoc.state = UrlIndexingState.DISQUALIFIED;
         otherDoc.stateReason = "Duplicate";
         return true;

View File

@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.logic.links;
 import ca.rmen.porterstemmer.PorterStemmer;
-import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.converting.model.ProcessedDomain;
 import java.util.*;
@@ -16,7 +16,7 @@ public class CommonKeywordExtractor {
     private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;
-    public List<String> getCommonSiteWords(ProcessedDomain ret, EdgePageWordFlags... flags) {
+    public List<String> getCommonSiteWords(ProcessedDomain ret, WordFlags... flags) {
         if (ret.documents.size() < MIN_REQUIRED_DOCUMENTS)
             return Collections.emptyList();
@@ -27,7 +27,7 @@ public class CommonKeywordExtractor {
         final Map<String, Set<String>> stemmedToNonstemmedVariants = new HashMap<>(ret.documents.size()*10);
         int qualifiedDocCount = 0;
-        long wordFlags = Arrays.stream(flags).mapToInt(EdgePageWordFlags::asBit).reduce(0, (a,b) -> a|b);
+        long wordFlags = Arrays.stream(flags).mapToInt(WordFlags::asBit).reduce(0, (a, b) -> a|b);
         for (var doc : ret.documents) {
             if (doc.words == null)
                 continue;

View File

@@ -1,6 +1,6 @@
 package nu.marginalia.converting.processor.logic.links;
-import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.model.EdgeUrl;
@@ -22,7 +22,7 @@ public class InternalLinkGraph {
         internalLinkGraph.put(doc.url, new HashSet<>(doc.details.linksInternal));
         knownUrls.addAll(doc.details.linksInternal);
-        List<String> topKeywords = doc.words.getWordsWithAnyFlag(EdgePageWordFlags.TfIdfHigh.asBit() | EdgePageWordFlags.Subjects.asBit());
+        List<String> topKeywords = doc.words.getWordsWithAnyFlag(WordFlags.TfIdfHigh.asBit() | WordFlags.Subjects.asBit());
         topKeywordsByUrl.put(doc.url, new HashSet<>(topKeywords));
         candidateKeywordsByUrl.put(doc.url, new HashSet<>(topKeywords));

View File

@@ -1,6 +0,0 @@
-package nu.marginalia.converting.processor.logic.pubdate;
-public enum PubDateEffortLevel {
-    LOW,
-    HIGH
-}

View File

@@ -1,23 +0,0 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
-import nu.marginalia.model.EdgeUrl;
-import org.jsoup.nodes.Document;
-import java.util.Optional;
-public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {
-    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
-        if (htmlStandard == EdgeHtmlStandard.UNKNOWN)
-            return Optional.empty();
-        return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard)));
-    }
-}

View File

@@ -4,7 +4,7 @@ import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.language.LanguageFilter;
 import nu.marginalia.language.model.DocumentLanguageData;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.converting.model.DocumentKeywordsBuilder;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.converting.model.DisqualifiedException;
@@ -56,7 +56,7 @@ public abstract class AbstractDocumentProcessorPlugin {
         return this;
     }
-    public MetaTagsBuilder addFormat(EdgeHtmlStandard standard) {
+    public MetaTagsBuilder addFormat(HtmlStandard standard) {
         tagWords.add("format:"+standard.toString().toLowerCase());
         return this;
     }

View File

@@ -7,23 +7,22 @@ import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
 import nu.marginalia.crawling.common.link.LinkParser;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.language.model.KeywordMetadata;
 import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
-import nu.marginalia.model.crawl.EdgePageDocumentFlags;
+import nu.marginalia.crawling.common.model.HtmlStandard;
+import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.converting.model.DocumentKeywordsBuilder;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.converting.processor.logic.*;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateSniffer;
 import nu.marginalia.gregex.GuardedRegex;
 import nu.marginalia.gregex.GuardedRegexFactory;
 import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.ProcessedDocumentDetails;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.pubdate.PubDateSniffer;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -120,7 +119,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         ret.hashCode = dld.localitySensitiveHashCode();
         PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
-        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class));
+        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(DocumentFlags.class));
         DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
@@ -262,10 +261,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         words.addAllSyntheticTerms(linkTerms);
     }
-    private EdgeHtmlStandard getHtmlStandard(Document doc) {
-        EdgeHtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
-        if (EdgeHtmlStandard.UNKNOWN.equals(htmlStandard)) {
+    private HtmlStandard getHtmlStandard(Document doc) {
+        HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
+        if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
             return HtmlStandardExtractor.sniffHtmlStandard(doc);
         }
         return htmlStandard;

View File

@@ -4,11 +4,10 @@ import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.language.model.KeywordMetadata;
 import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
-import nu.marginalia.model.crawl.EdgePageDocumentFlags;
+import nu.marginalia.crawling.common.model.HtmlStandard;
+import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.converting.model.DocumentKeywordsBuilder;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.crawl.PubDate;
@@ -16,7 +15,7 @@ import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.ProcessedDocumentDetails;
 import nu.marginalia.converting.processor.logic.PlainTextLogic;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.util.LineUtils;
+import nu.marginalia.converting.util.LineUtils;
 import org.apache.commons.lang3.StringUtils;
 import java.net.URISyntaxException;
@@ -78,7 +77,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
         List<String> firstFewLines = LineUtils.firstNLines(documentBody, 40);
         ret.length = documentBody.length();
-        ret.standard = EdgeHtmlStandard.PLAIN;
+        ret.standard = HtmlStandard.PLAIN;
         ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);
         ret.quality = -1;
@@ -89,7 +88,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
         final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));
-        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText));
+        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(DocumentFlags.PlainText));
         DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);

View File

@ -1,4 +1,4 @@
package nu.marginalia.util; package nu.marginalia.converting.util;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
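
The only change to LineUtils is the package move. For context, a minimal usage sketch matching the call site in PlainTextDocumentProcessorPlugin above; the demo class and sample string are illustrative:

import nu.marginalia.converting.util.LineUtils;
import java.util.List;

class LineUtilsDemo {
    public static void main(String[] args) {
        String documentBody = "first line\nsecond line\nthird line";
        // Take at most the first 40 lines of the body, as the plain-text
        // processor does when hunting for a title.
        List<String> firstFewLines = LineUtils.firstNLines(documentBody, 40);
        System.out.println(firstFewLines);
    }
}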

View File

@ -1,7 +1,7 @@
package nu.marginalia.converting.logic; package nu.marginalia.converting.logic;
import nu.marginalia.converting.processor.logic.PlainTextLogic; import nu.marginalia.converting.processor.logic.PlainTextLogic;
import nu.marginalia.util.LineUtils; import nu.marginalia.converting.util.LineUtils;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;

View File

@ -3,7 +3,6 @@ package nu.marginalia.converting.processor.keywords;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.LanguageModels; import nu.marginalia.LanguageModels;
import nu.marginalia.language.WordPatterns; import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSpan; import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.SentenceExtractor;
@ -12,10 +11,7 @@ import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.model.WordSeparator; import nu.marginalia.language.model.WordSeparator;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.test.util.TestLanguageModels; import nu.marginalia.test.util.TestLanguageModels;
import org.apache.commons.lang3.tuple.Pair;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Disabled;
@ -27,7 +23,6 @@ import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.*; import java.util.*;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.IntStream;
@Tag("slow") @Tag("slow")
class SentenceExtractorTest { class SentenceExtractorTest {

View File

@ -1,4 +1,4 @@
package nu.marginalia.util; package nu.marginalia.converting.util;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;

View File

@ -5,9 +5,8 @@ import com.google.common.hash.Hashing;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.crawling.model.CrawlingSpecification; import nu.marginalia.crawling.model.CrawlingSpecification;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
import java.sql.Connection;
import java.sql.ResultSet; import java.sql.ResultSet;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.ArrayList; import java.util.ArrayList;
@ -69,11 +68,11 @@ public class CrawlJobDomainExtractor {
"""; """;
private final EdgeDomainBlacklistImpl blacklist; private final DomainBlacklistImpl blacklist;
private final HikariDataSource dataSource; private final HikariDataSource dataSource;
private static final HashFunction hasher = Hashing.murmur3_128(0); private static final HashFunction hasher = Hashing.murmur3_128(0);
public CrawlJobDomainExtractor(EdgeDomainBlacklistImpl blacklist, HikariDataSource dataSource) { public CrawlJobDomainExtractor(DomainBlacklistImpl blacklist, HikariDataSource dataSource) {
this.blacklist = blacklist; this.blacklist = blacklist;
this.dataSource = dataSource; this.dataSource = dataSource;
} }

View File

@ -2,7 +2,7 @@ package nu.marginalia.crawl;
import nu.marginalia.crawling.model.CrawlingSpecification; import nu.marginalia.crawling.model.CrawlingSpecification;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.service.module.DatabaseModule;
import java.io.IOException; import java.io.IOException;
@ -37,7 +37,7 @@ public class CrawlJobExtractorMain {
private static Stream<CrawlingSpecification> streamSpecs(String[] targetDomains) { private static Stream<CrawlingSpecification> streamSpecs(String[] targetDomains) {
var ds = new DatabaseModule().provideConnection(); var ds = new DatabaseModule().provideConnection();
var domainExtractor = new CrawlJobDomainExtractor(new EdgeDomainBlacklistImpl(ds), ds); var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(ds), ds);
if (targetDomains.length > 0) { if (targetDomains.length > 0) {
return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractDomain); return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractDomain);

View File

@ -0,0 +1,5 @@
package nu.marginalia.crawling.model;
public record ContentType(String contentType, String charset) {
}
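
Since ContentType is now a record, call sites switch from field access to the generated accessor methods, which is exactly what the HttpFetcher and ContentTypeParser hunks below do. A minimal sketch; the demo class itself is illustrative:

import nu.marginalia.crawling.model.ContentType;

class ContentTypeDemo {
    public static void main(String[] args) {
        ContentType ct = new ContentType("text/html", "UTF-8");
        // Records generate accessors named after their components, so the
        // old field access contentType.charset becomes contentType.charset().
        System.out.println(ct.contentType()); // text/html
        System.out.println(ct.charset());     // UTF-8
    }
}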

View File

@ -9,7 +9,7 @@ import lombok.SneakyThrows;
import lombok.ToString; import lombok.ToString;
import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.model.crawl.EdgeContentType; import nu.marginalia.crawling.model.ContentType;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.bigstring.BigString; import nu.marginalia.bigstring.BigString;
@ -257,11 +257,11 @@ public class HttpFetcher {
byte[] data = byteStream.readNBytes(maxFetchSize); byte[] data = byteStream.readNBytes(maxFetchSize);
var contentType = ContentTypeParser.parse(contentTypeHeader, data); var contentType = ContentTypeParser.parse(contentTypeHeader, data);
if (!contentTypeLogic.isAllowableContentType(contentType.contentType)) { if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) {
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
} }
if ("Shift_JIS".equalsIgnoreCase(contentType.charset)) { if ("Shift_JIS".equalsIgnoreCase(contentType.charset())) {
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, ""); return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, "");
} }
@ -280,10 +280,10 @@ public class HttpFetcher {
.build(); .build();
} }
private String getStringData(byte[] data, EdgeContentType contentType) { private String getStringData(byte[] data, ContentType contentType) {
Charset charset; Charset charset;
try { try {
charset = Charset.forName(contentType.charset); charset = Charset.forName(contentType.charset());
} }
catch (IllegalCharsetNameException ex) { catch (IllegalCharsetNameException ex) {
charset = StandardCharsets.UTF_8; charset = StandardCharsets.UTF_8;

View File

@ -1,7 +1,7 @@
package nu.marginalia.crawl.retreival.logic; package nu.marginalia.crawl.retreival.logic;
import crawlercommons.mimetypes.MimeTypeDetector; import crawlercommons.mimetypes.MimeTypeDetector;
import nu.marginalia.model.crawl.EdgeContentType; import nu.marginalia.crawling.model.ContentType;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import java.util.Arrays; import java.util.Arrays;
@ -11,25 +11,25 @@ public class ContentTypeParser {
static final MimeTypeDetector mimeTypeDetector = new MimeTypeDetector(); static final MimeTypeDetector mimeTypeDetector = new MimeTypeDetector();
public static EdgeContentType parse(String contentType, byte[] data) { public static ContentType parse(String contentType, byte[] data) {
return getContentTypeFromContentTypeString(contentType) return getContentTypeFromContentTypeString(contentType)
.or(() -> getContentTypeStringFromTag(data)) .or(() -> getContentTypeStringFromTag(data))
.orElseGet(() -> { .orElseGet(() -> {
Optional<String> charset = getCharsetFromTag(data); Optional<String> charset = getCharsetFromTag(data);
return new EdgeContentType( return new ContentType(
Optional.ofNullable(contentType) Optional.ofNullable(contentType)
.or(() -> Optional.ofNullable(mimeTypeDetector.detect(data))) .or(() -> Optional.ofNullable(mimeTypeDetector.detect(data)))
.orElseGet(() -> ContentTypeParser.shittyMimeSniffer(data)), charset.orElse("ISO_8859_1")); .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(data)), charset.orElse("ISO_8859_1"));
}); });
} }
private static Optional<EdgeContentType> getContentTypeFromContentTypeString(String contentType) { private static Optional<ContentType> getContentTypeFromContentTypeString(String contentType) {
if (contentType != null && contentType.contains(";")) { if (contentType != null && contentType.contains(";")) {
var parts = contentType.split(";"); var parts = contentType.split(";");
var content = parts[0].trim(); var content = parts[0].trim();
var extra = parts[1].trim(); var extra = parts[1].trim();
if (extra.startsWith("charset=")) { if (extra.startsWith("charset=")) {
return Optional.of(new EdgeContentType(content, extra.substring("charset=".length()))); return Optional.of(new ContentType(content, extra.substring("charset=".length())));
} }
} }
return Optional.empty(); return Optional.empty();
@ -53,7 +53,7 @@ public class ContentTypeParser {
} }
private static Optional<EdgeContentType> getContentTypeStringFromTag(byte[] data) { private static Optional<ContentType> getContentTypeStringFromTag(byte[] data) {
String header = new String(Arrays.copyOf(data, Math.min(1024, data.length))); String header = new String(Arrays.copyOf(data, Math.min(1024, data.length)));
var doc = Jsoup.parse(header); var doc = Jsoup.parse(header);
for (var metaTag : doc.getElementsByTag("meta")) { for (var metaTag : doc.getElementsByTag("meta")) {
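
For reference, the header-splitting branch above turns a raw Content-Type header value into the new record. A hedged sketch of calling the parser; the demo class and sample inputs are illustrative:

import nu.marginalia.crawl.retreival.logic.ContentTypeParser;
import nu.marginalia.crawling.model.ContentType;

class ParseDemo {
    public static void main(String[] args) {
        byte[] data = "<html><head><meta charset=\"UTF-8\"></head></html>".getBytes();
        // The charset parameter is peeled off the header string; without one,
        // the parser falls through to meta-tag sniffing and mime detection,
        // defaulting the charset to ISO_8859_1.
        ContentType ct = ContentTypeParser.parse("text/html; charset=UTF-8", data);
        System.out.println(ct.contentType()); // text/html
        System.out.println(ct.charset());     // UTF-8
    }
}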

View File

@ -24,6 +24,8 @@ dependencies {
implementation project(':code:crawl:common') implementation project(':code:crawl:common')
implementation project(':code:crawl:crawling-model') implementation project(':code:crawl:crawling-model')
implementation project(':code:crawl:converting-process') implementation project(':code:crawl:converting-process')
implementation project(':code:features:adblock')
implementation project(':code:features:topic-detection')
implementation libs.lombok implementation libs.lombok
annotationProcessor libs.lombok annotationProcessor libs.lombok

View File

@ -1,7 +1,7 @@
package nu.marginalia.experimental; package nu.marginalia.experimental;
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.DocumentProcessor; import nu.marginalia.converting.processor.DocumentProcessor;
import nu.marginalia.converting.processor.logic.topic.AdblockSimulator;
import nu.marginalia.crawling.common.plan.CrawlPlanLoader; import nu.marginalia.crawling.common.plan.CrawlPlanLoader;
import nu.marginalia.crawling.common.plan.EdgeCrawlPlan; import nu.marginalia.crawling.common.plan.EdgeCrawlPlan;
import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDocument;

View File

@ -10,10 +10,10 @@ import nu.marginalia.converting.processor.logic.DomPruningFilter;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.processor.logic.topic.GoogleAnwersSpamDetector; import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.processor.logic.topic.RecipeDetector; import nu.marginalia.topic.RecipeDetector;
import nu.marginalia.converting.processor.logic.topic.TextileCraftDetector; import nu.marginalia.topic.TextileCraftDetector;
import nu.marginalia.converting.processor.logic.topic.WoodworkingDetector; import nu.marginalia.topic.WoodworkingDetector;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@ -1,8 +1,8 @@
package nu.marginalia.experimental; package nu.marginalia.experimental;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.DocumentProcessor; import nu.marginalia.converting.processor.DocumentProcessor;
import nu.marginalia.converting.processor.logic.topic.AdblockSimulator;
import nu.marginalia.crawling.common.plan.CrawlPlanLoader; import nu.marginalia.crawling.common.plan.CrawlPlanLoader;
import nu.marginalia.crawling.common.plan.EdgeCrawlPlan; import nu.marginalia.crawling.common.plan.EdgeCrawlPlan;
import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDocument;

View File

@ -28,7 +28,6 @@ dependencies {
implementation project(':code:index:lexicon') implementation project(':code:index:lexicon')
implementation project(':code:index:index-journal') implementation project(':code:index:index-journal')
implementation project(':code:libraries:language-processing') implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:misc')
testImplementation project(':code:services-core:search-service') testImplementation project(':code:services-core:search-service')

View File

@ -2,7 +2,7 @@ package nu.marginalia.loading.loader;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.EdgeDomainIndexingState; import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.converting.instruction.Interpreter; import nu.marginalia.converting.instruction.Interpreter;
import nu.marginalia.converting.model.DocumentKeywords; import nu.marginalia.converting.model.DocumentKeywords;
@ -76,7 +76,7 @@ public class Loader implements Interpreter {
} }
@Override @Override
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) { public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) {
sqlLoadProcessedDomain.load(data, domain, state, ip); sqlLoadProcessedDomain.load(data, domain, state, ip);
} }

View File

@ -2,7 +2,7 @@ package nu.marginalia.loading.loader;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.model.crawl.EdgeDomainIndexingState; import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.converting.instruction.instructions.DomainLink; import nu.marginalia.converting.instruction.instructions.DomainLink;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -42,7 +42,7 @@ public class SqlLoadProcessedDomain {
} }
} }
public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, String ip) { public void load(LoaderData data, EdgeDomain domain, DomainIndexingState state, String ip) {
data.setTargetDomain(domain); data.setTargetDomain(domain);
loadDomains.load(data, domain); loadDomains.load(data, domain);

View File

@ -9,8 +9,8 @@ import nu.marginalia.loading.loader.SqlLoadProcessedDocument;
import nu.marginalia.loading.loader.SqlLoadUrls; import nu.marginalia.loading.loader.SqlLoadUrls;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.EdgeUrlState; import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.id.EdgeIdArray; import nu.marginalia.model.id.EdgeIdArray;
import org.junit.jupiter.api.*; import org.junit.jupiter.api.*;
@ -69,11 +69,11 @@ class SqlLoadProcessedDocumentTest {
loader.load(loaderData, List.of(new LoadProcessedDocument( loader.load(loaderData, List.of(new LoadProcessedDocument(
url, url,
EdgeUrlState.OK, UrlIndexingState.OK,
"TITLE", "TITLE",
"DESCR", "DESCR",
HtmlFeature.encode(Set.of(HtmlFeature.AFFILIATE_LINK)), HtmlFeature.encode(Set.of(HtmlFeature.AFFILIATE_LINK)),
EdgeHtmlStandard.HTML5, HtmlStandard.HTML5,
100, 100,
12345, 12345,
-3.14, -3.14,

View File

@ -6,7 +6,7 @@ import nu.marginalia.loading.loader.SqlLoadDomains;
import nu.marginalia.loading.loader.SqlLoadProcessedDomain; import nu.marginalia.loading.loader.SqlLoadProcessedDomain;
import nu.marginalia.converting.instruction.instructions.DomainLink; import nu.marginalia.converting.instruction.instructions.DomainLink;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.EdgeDomainIndexingState; import nu.marginalia.model.crawl.DomainIndexingState;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Tag;
@ -48,7 +48,7 @@ class SqlLoadProcessedDomainTest {
@Test @Test
public void loadProcessedDomain() { public void loadProcessedDomain() {
var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource));
loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), EdgeDomainIndexingState.BLOCKED, "127.0.0.1"); loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1");
} }
@Test @Test
public void loadDomainAlias() { public void loadDomainAlias() {

View File

@ -0,0 +1,41 @@
plugins {
id 'java'
id "io.freefair.lombok" version "5.3.3.3"
id "de.undercouch.download" version "5.1.0"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
dependencies {
implementation project(':code:common:config')
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.guice
implementation libs.notnull
implementation libs.jsoup
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
useJUnitPlatform()
}
task fastTests(type: Test) {
useJUnitPlatform {
excludeTags "slow"
}
}

View File

@ -0,0 +1,8 @@
# Adblock
Contains an adblock simulator that reads an adblock specification file and
uses it to determine whether a document contains ads.
## Central Classes
* [AdblockSimulator](src/main/java/nu/marginalia/adblock/AdblockSimulator.java)
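
A hypothetical usage sketch: AdblockSimulator is Guice-injectable per the diff below, but the hasAds method name is an assumption standing in for whatever predicate the class actually exposes:

import com.google.inject.Inject;
import nu.marginalia.adblock.AdblockSimulator;
import org.jsoup.nodes.Document;

class AdsCheck {
    private final AdblockSimulator adblock;

    @Inject
    AdsCheck(AdblockSimulator adblock) {
        this.adblock = adblock;
    }

    boolean check(Document doc) {
        // hypothetical method name, for illustration only
        return adblock.hasAds(doc);
    }
}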

View File

@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic.topic; package nu.marginalia.adblock;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;

View File

@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic.topic; package nu.marginalia.adblock;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;

View File

@ -2,7 +2,7 @@ package nu.marginalia.ranking.data;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Data; import lombok.Data;
import nu.marginalia.model.crawl.EdgeDomainIndexingState; import nu.marginalia.model.crawl.DomainIndexingState;
@Data @Data
@AllArgsConstructor @AllArgsConstructor
@ -10,7 +10,7 @@ public class RankingDomainData {
public final int id; public final int id;
public final String name; public final String name;
private int alias; private int alias;
public EdgeDomainIndexingState state; public DomainIndexingState state;
public final int knownUrls; public final int knownUrls;
public int resolveAlias() { public int resolveAlias() {
@ -23,10 +23,10 @@ public class RankingDomainData {
} }
public boolean isSpecial() { public boolean isSpecial() {
return EdgeDomainIndexingState.SPECIAL == state; return DomainIndexingState.SPECIAL == state;
} }
public boolean isSocialMedia() { public boolean isSocialMedia() {
return EdgeDomainIndexingState.SOCIAL_MEDIA == state; return DomainIndexingState.SOCIAL_MEDIA == state;
} }
} }

View File

@ -3,8 +3,8 @@ package nu.marginalia.ranking.data;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
import nu.marginalia.model.crawl.EdgeDomainIndexingState; import nu.marginalia.model.crawl.DomainIndexingState;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -15,13 +15,13 @@ import java.util.function.IntConsumer;
@Singleton @Singleton
public class RankingDomainFetcher { public class RankingDomainFetcher {
protected final HikariDataSource dataSource; protected final HikariDataSource dataSource;
protected final EdgeDomainBlacklistImpl blacklist; protected final DomainBlacklistImpl blacklist;
protected final Logger logger = LoggerFactory.getLogger(getClass()); protected final Logger logger = LoggerFactory.getLogger(getClass());
protected boolean getNames = false; protected boolean getNames = false;
@Inject @Inject
public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) { public RankingDomainFetcher(HikariDataSource dataSource, DomainBlacklistImpl blacklist) {
this.dataSource = dataSource; this.dataSource = dataSource;
this.blacklist = blacklist; this.blacklist = blacklist;
} }
@ -66,7 +66,7 @@ public class RankingDomainFetcher {
new RankingDomainData(id, new RankingDomainData(id,
rsp.getString(2), rsp.getString(2),
rsp.getInt(3), rsp.getInt(3),
EdgeDomainIndexingState.valueOf(rsp.getString(4)), DomainIndexingState.valueOf(rsp.getString(4)),
rsp.getInt(5))); rsp.getInt(5)));
} }
} }

View File

@ -3,7 +3,7 @@ package nu.marginalia.ranking.data;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.sql.SQLException; import java.sql.SQLException;
@ -14,7 +14,7 @@ public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher
final boolean hasData; final boolean hasData;
@Inject @Inject
public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) { public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, DomainBlacklistImpl blacklist) {
super(dataSource, blacklist); super(dataSource, blacklist);
hasData = isDomainNeighborTablePopulated(dataSource); hasData = isDomainNeighborTablePopulated(dataSource);

View File

@ -2,7 +2,7 @@ package nu.marginalia.ranking.tool;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
import nu.marginalia.ranking.StandardPageRank; import nu.marginalia.ranking.StandardPageRank;
import nu.marginalia.ranking.accumulator.RankingResultListAccumulator; import nu.marginalia.ranking.accumulator.RankingResultListAccumulator;
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
@ -32,7 +32,7 @@ public class CreateBrowseDomainRanksTool {
logger.info("Ranking"); logger.info("Ranking");
var ds = new DatabaseModule().provideConnection(); var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds)); var domains = new RankingDomainFetcherForSimilarityData(ds, new DomainBlacklistImpl(ds));
var rpr = new StandardPageRank(domains, args); var rpr = new StandardPageRank(domains, args);
uploader.start(); uploader.start();

View File

@ -13,7 +13,7 @@ import lombok.SneakyThrows;
import nu.marginalia.ranking.RankingAlgorithm; import nu.marginalia.ranking.RankingAlgorithm;
import nu.marginalia.ranking.data.RankingDomainData; import nu.marginalia.ranking.data.RankingDomainData;
import nu.marginalia.ranking.data.RankingDomainFetcher; import nu.marginalia.ranking.data.RankingDomainFetcher;
import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.service.module.DatabaseModule;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -49,7 +49,7 @@ public class PerusePageRankV2 {
@SneakyThrows @SneakyThrows
public static void main(String... args) { public static void main(String... args) {
var ds = new DatabaseModule().provideConnection(); var ds = new DatabaseModule().provideConnection();
var blacklist = new EdgeDomainBlacklistImpl(ds); var blacklist = new DomainBlacklistImpl(ds);
var rank = new PerusePageRankV2(new RankingDomainFetcher(ds, blacklist)); var rank = new PerusePageRankV2(new RankingDomainFetcher(ds, blacklist));
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();

View File

@ -3,7 +3,7 @@ package nu.marginalia.ranking.tool;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.ranking.accumulator.RankingResultListAccumulator; import nu.marginalia.ranking.accumulator.RankingResultListAccumulator;
import nu.marginalia.ranking.data.RankingDomainFetcher; import nu.marginalia.ranking.data.RankingDomainFetcher;
import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
import nu.marginalia.ranking.StandardPageRank; import nu.marginalia.ranking.StandardPageRank;
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.service.module.DatabaseModule;
@ -35,11 +35,11 @@ public class PrintDomainRanksTool {
RankingDomainFetcher domains; RankingDomainFetcher domains;
if (Boolean.getBoolean("use-link-data")) { if (Boolean.getBoolean("use-link-data")) {
domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); domains = new RankingDomainFetcher(ds, new DomainBlacklistImpl(ds));
domains.retainNames(); domains.retainNames();
} }
else { else {
domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds)); domains = new RankingDomainFetcherForSimilarityData(ds, new DomainBlacklistImpl(ds));
domains.retainNames(); domains.retainNames();
} }

View File

@ -6,7 +6,7 @@ import nu.marginalia.ranking.StandardPageRank;
import nu.marginalia.ranking.accumulator.RankingResultListAccumulator; import nu.marginalia.ranking.accumulator.RankingResultListAccumulator;
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.service.module.DatabaseModule;
import org.mariadb.jdbc.Driver; import org.mariadb.jdbc.Driver;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -33,7 +33,7 @@ public class UpdateDomainRanksTool {
var uploader = new Thread(() -> uploadThread(conn), "Uploader"); var uploader = new Thread(() -> uploadThread(conn), "Uploader");
logger.info("Ranking"); logger.info("Ranking");
var domains = new RankingDomainFetcherForSimilarityData(conn, new EdgeDomainBlacklistImpl(conn)); var domains = new RankingDomainFetcherForSimilarityData(conn, new DomainBlacklistImpl(conn));
var rpr = new StandardPageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com"); var rpr = new StandardPageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com");
rankMax = rpr.size(); rankMax = rpr.size();

View File

@ -0,0 +1,44 @@
plugins {
id 'java'
id "io.freefair.lombok" version "5.3.3.3"
id "de.undercouch.download" version "5.1.0"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
dependencies {
implementation project(':code:common:model')
implementation project(':code:crawl:common')
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.guice
implementation libs.notnull
implementation libs.gson
implementation libs.jsoup
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation project(':code:common:config')
}
test {
useJUnitPlatform()
}
task fastTests(type: Test) {
useJUnitPlatform {
excludeTags "slow"
}
}

View File

@ -0,0 +1,7 @@
# Pubdate
Contains advanced haruspicy for figuring out when a document was published.
## Central Classes
* [PubDateSniffer](src/main/java/nu/marginalia/pubdate/PubDateSniffer.java)
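
A minimal sketch mirroring the unit tests further down in this commit; the no-argument PubDateSniffer constructor is an assumption, while the getPubDate signature is as moved here:

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.PubDateSniffer;
import org.jsoup.Jsoup;

class SniffDemo {
    public static void main(String[] args) throws Exception {
        var sniffer = new PubDateSniffer();
        var doc = Jsoup.parse("""
            <article><time pubdate="pubdate" datetime="2022-08-24">time</time></article>
            """);
        // 'true' enables the expensive (HIGH effort) heuristics as well.
        var pubDate = sniffer.getPubDate("", new EdgeUrl("https://www.example.com/"),
                doc, HtmlStandard.UNKNOWN, true);
        System.out.println(pubDate.dateIso8601()); // 2022-08-24
    }
}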

View File

@ -0,0 +1,6 @@
package nu.marginalia.pubdate;
public enum PubDateEffortLevel {
LOW,
HIGH
}

View File

@ -1,7 +1,7 @@
package nu.marginalia.converting.processor.logic.pubdate; package nu.marginalia.pubdate;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
@ -9,5 +9,5 @@ import java.util.Optional;
public interface PubDateHeuristic { public interface PubDateHeuristic {
Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard); Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard);
} }

View File

@ -1,6 +1,6 @@
package nu.marginalia.converting.processor.logic.pubdate; package nu.marginalia.pubdate;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import java.time.DateTimeException; import java.time.DateTimeException;
@ -122,7 +122,7 @@ public class PubDateParser {
return (max + min) / 2; return (max + min) / 2;
} }
public static int guessYear(EdgeHtmlStandard standard) { public static int guessYear(HtmlStandard standard) {
// Create some jitter to avoid having documents piling up in the same four years // Create some jitter to avoid having documents piling up in the same four years
// as this would make searching in those years disproportionately useless // as this would make searching in those years disproportionately useless
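
The jitter the comment refers to is below the visible hunk; a hypothetical sketch of the idea, where minYear and maxYear stand in for whatever bounds the HTML standard implies:

// Spread guessed years across a window instead of returning a constant,
// so documents without a recoverable date don't all land on the same year.
static int guessYear(int minYear, int maxYear) {
    return minYear + (int) (Math.random() * (maxYear - minYear + 1));
}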

View File

@ -1,9 +1,9 @@
package nu.marginalia.converting.processor.logic.pubdate; package nu.marginalia.pubdate;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.processor.logic.pubdate.heuristic.*;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.heuristic.*;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.util.ArrayList; import java.util.ArrayList;
@ -36,7 +36,7 @@ public class PubDateSniffer {
heuristics.add(new PubDateHeuristicGuessFromHtmlStandard()); heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
} }
public PubDate getPubDate(String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard, boolean runExpensive) { public PubDate getPubDate(String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) {
final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW; final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW;
for (var heuristic : heuristics) { for (var heuristic : heuristics) {

View File

@ -1,11 +1,11 @@
package nu.marginalia.converting.processor.logic.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateEffortLevel;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
@ -18,7 +18,7 @@ import java.util.Optional;
public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic { public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
if (effortLevel == PubDateEffortLevel.LOW) if (effortLevel == PubDateEffortLevel.LOW)
return Optional.empty(); return Optional.empty();
@ -32,9 +32,9 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
private static class DateExtractingNodeVisitorPass implements NodeFilter { private static class DateExtractingNodeVisitorPass implements NodeFilter {
public PubDate pubDate; public PubDate pubDate;
private final EdgeHtmlStandard htmlStandard; private final HtmlStandard htmlStandard;
private DateExtractingNodeVisitorPass(EdgeHtmlStandard htmlStandard) { private DateExtractingNodeVisitorPass(HtmlStandard htmlStandard) {
this.htmlStandard = htmlStandard; this.htmlStandard = htmlStandard;
} }
@ -130,7 +130,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
} }
private void parse(String text) { private void parse(String text) {
if (htmlStandard == EdgeHtmlStandard.UNKNOWN) { if (htmlStandard == HtmlStandard.UNKNOWN) {
PubDateParser PubDateParser
.dateFromHighestYearLookingSubstring(text) .dateFromHighestYearLookingSubstring(text)
.ifPresent(this::setPubDate); .ifPresent(this::setPubDate);

View File

@ -1,10 +1,10 @@
package nu.marginalia.converting.processor.logic.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
@ -17,7 +17,7 @@ import java.util.Optional;
public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic { public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
if (effortLevel == PubDateEffortLevel.LOW) if (effortLevel == PubDateEffortLevel.LOW)
return Optional.empty(); return Optional.empty();
@ -31,9 +31,9 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
private static class DateExtractingNodeVisitor implements NodeFilter { private static class DateExtractingNodeVisitor implements NodeFilter {
public PubDate pubDate; public PubDate pubDate;
private final EdgeHtmlStandard htmlStandard; private final HtmlStandard htmlStandard;
private DateExtractingNodeVisitor(EdgeHtmlStandard htmlStandard) { private DateExtractingNodeVisitor(HtmlStandard htmlStandard) {
this.htmlStandard = htmlStandard; this.htmlStandard = htmlStandard;
} }
@ -71,7 +71,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
} }
private void parse(String text) { private void parse(String text) {
if (htmlStandard == EdgeHtmlStandard.UNKNOWN) { if (htmlStandard == HtmlStandard.UNKNOWN) {
PubDateParser PubDateParser
.dateFromHighestYearLookingSubstring(text) .dateFromHighestYearLookingSubstring(text)
.ifPresent(this::setPubDate); .ifPresent(this::setPubDate);

View File

@ -0,0 +1,23 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;
import java.util.Optional;
public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
if (htmlStandard == HtmlStandard.UNKNOWN)
return Optional.empty();
return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard)));
}
}

View File

@ -1,11 +1,11 @@
package nu.marginalia.converting.processor.logic.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateEffortLevel;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.util.Optional; import java.util.Optional;
@ -13,7 +13,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic { public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
// HTML5, alternative approach // HTML5, alternative approach
for (var tag : document.select("time")) { for (var tag : document.select("time")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime")); var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));

View File

@ -1,11 +1,11 @@
package nu.marginalia.converting.processor.logic.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateEffortLevel;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.util.Optional; import java.util.Optional;
@ -13,7 +13,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic { public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
// HTML5 // HTML5
for (var tag : document.select("time[pubdate=\"pubdate\"]")) { for (var tag : document.select("time[pubdate=\"pubdate\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime")); var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));

View File

@ -1,11 +1,11 @@
package nu.marginalia.converting.processor.logic.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateEffortLevel;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.util.Optional; import java.util.Optional;
@ -13,7 +13,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic { public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
for (var tag : document.select("time[itemprop=\"datePublished\"]")) { for (var tag : document.select("time[itemprop=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
if (maybeDate.isPresent()) { if (maybeDate.isPresent()) {

View File

@ -1,14 +1,14 @@
package nu.marginalia.converting.processor.logic.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.google.gson.GsonBuilder; import com.google.gson.GsonBuilder;
import com.google.gson.JsonSyntaxException; import com.google.gson.JsonSyntaxException;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateEffortLevel;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.util.Optional; import java.util.Optional;
@ -16,7 +16,7 @@ import java.util.Optional;
public class PubDateHeuristicJSONLD implements PubDateHeuristic { public class PubDateHeuristicJSONLD implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
for (var tag : document.select("script[type=\"application/ld+json\"]")) { for (var tag : document.select("script[type=\"application/ld+json\"]")) {
var maybeDate = parseLdJson(tag.data()) var maybeDate = parseLdJson(tag.data())
.flatMap(PubDateParser::attemptParseDate); .flatMap(PubDateParser::attemptParseDate);

View File

@ -1,11 +1,11 @@
package nu.marginalia.converting.processor.logic.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateEffortLevel;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.util.Optional; import java.util.Optional;
@ -13,7 +13,7 @@ import java.util.Optional;
public class PubDateHeuristicLastModified implements PubDateHeuristic { public class PubDateHeuristicLastModified implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
String lmString = "last-modified: "; String lmString = "last-modified: ";
int offset = headers.toLowerCase().indexOf(lmString); int offset = headers.toLowerCase().indexOf(lmString);

View File

@ -1,11 +1,11 @@
package nu.marginalia.converting.processor.logic.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.pubdate.PubDateEffortLevel;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.util.Optional; import java.util.Optional;
@ -13,7 +13,7 @@ import java.util.Optional;
public class PubDateHeuristicMicrodata implements PubDateHeuristic { public class PubDateHeuristicMicrodata implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
for (var tag : document.select("meta[itemprop=\"datePublished\"]")) { for (var tag : document.select("meta[itemprop=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));

View File

@ -1,10 +1,10 @@
package nu.marginalia.converting.processor.logic.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
@ -13,7 +13,7 @@ import java.util.Optional;
public class PubDateHeuristicOpenGraph implements PubDateHeuristic { public class PubDateHeuristicOpenGraph implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
// OG // OG
for (var tag : document.select("meta[property=\"article:published_time\"]")) { for (var tag : document.select("meta[property=\"article:published_time\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));

View File

@ -1,10 +1,10 @@
package nu.marginalia.converting.processor.logic.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
@ -13,7 +13,7 @@ import java.util.Optional;
public class PubDateHeuristicRDFaTag implements PubDateHeuristic { public class PubDateHeuristicRDFaTag implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
for (var tag : document.select("meta[property=\"datePublished\"]")) { for (var tag : document.select("meta[property=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
if (maybeDate.isPresent()) { if (maybeDate.isPresent()) {

View File

@ -1,11 +1,11 @@
package nu.marginalia.converting.processor.logic.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.model.crawl.EdgeHtmlStandard; import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateEffortLevel;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.util.Optional; import java.util.Optional;
@ -20,7 +20,7 @@ public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {
private static final int MIN_URL_PATTERN_YEAR = 2000; private static final int MIN_URL_PATTERN_YEAR = 2000;
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
final String urlString = url.path; final String urlString = url.path;
var matcher = yearUrlPattern.matcher(urlString); var matcher = yearUrlPattern.matcher(urlString);

View File

@@ -1,11 +1,11 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
+package nu.marginalia.pubdate.heuristic;

-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.pubdate.PubDateHeuristic;
+import nu.marginalia.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
+import nu.marginalia.pubdate.PubDateEffortLevel;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -17,7 +17,7 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {
     private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");

     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         final String urlString = url.path;

         var matcher = yearUrlPattern.matcher(urlString);
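
Taken together, the hunks above pin down the post-move shape of the heuristic contract. Below is a minimal sketch of nu.marginalia.pubdate.PubDateHeuristic inferred from the apply(...) signatures in this diff; the actual interface may carry additional members, and PubDateEffortLevel is assumed to be a type that also moved to nu.marginalia.pubdate:

    package nu.marginalia.pubdate;

    import nu.marginalia.crawling.common.model.HtmlStandard;
    import nu.marginalia.model.EdgeUrl;
    import nu.marginalia.model.crawl.PubDate;
    import org.jsoup.nodes.Document;

    import java.util.Optional;

    // Sketch reconstructed from the call sites above, not the actual source file.
    public interface PubDateHeuristic {
        // Returns a publication date if this heuristic can extract one from the page.
        Optional<PubDate> apply(PubDateEffortLevel effortLevel,
                                String headers,
                                EdgeUrl url,
                                Document document,
                                HtmlStandard htmlStandard);
    }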


@@ -1,11 +1,9 @@
-package nu.marginalia.converting.logic;
+package nu.marginalia.pubdate;

 import nu.marginalia.WmsaHome;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateSniffer;
-import nu.marginalia.converting.processor.logic.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
+import nu.marginalia.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Test;
@@ -75,7 +73,7 @@ class PubDateSnifferTest {
                 <time pubdate="pubdate" datetime="2022-08-24">time</time>
                 Wow, sure lor 'em boss
                 </article>
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);

         assertFalse(ret.isEmpty());
         assertEquals("2022-08-24", ret.dateIso8601());
@@ -91,7 +89,7 @@ class PubDateSnifferTest {
                 <time>2022-08-24</time>
                 Wow, sure lor 'em boss
                 </article>
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);

         assertFalse(ret.isEmpty());
         assertEquals("2022-08-24", ret.dateIso8601());
@@ -107,7 +105,7 @@ class PubDateSnifferTest {
                 <time class="published" datetime="July 13, 2006">July 13, 2006</time>
                 Wow, sure lor 'em boss
                 </article>
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);

         assertFalse(ret.isEmpty());
         assertEquals(2006, ret.year());
@@ -117,14 +115,14 @@ class PubDateSnifferTest {
     public void testProblemCases() throws IOException, URISyntaxException {
         var ret = dateSniffer.getPubDate("",
                 new EdgeUrl("https://www.example.com/"),
-                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), EdgeHtmlStandard.HTML5, true);
+                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), HtmlStandard.HTML5, true);

         assertFalse(ret.isEmpty());
         assertEquals(2006, ret.year());

         ret = dateSniffer.getPubDate("",
                 new EdgeUrl("https://www.example.com/"),
-                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), EdgeHtmlStandard.XHTML, true);
+                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), HtmlStandard.XHTML, true);

         assertFalse(ret.isEmpty());
         assertEquals(2010, ret.year());
@@ -147,7 +145,7 @@ class PubDateSnifferTest {
                 <!doctype html>
                 <html>
                 <meta itemprop="datePublished" content="2022-08-24" />
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);

         assertFalse(ret.isEmpty());
         assertEquals("2022-08-24", ret.dateIso8601());
@@ -161,7 +159,7 @@ class PubDateSnifferTest {
                 <!doctype html>
                 <html>
                 <meta property="datePublished" content="2022-08-24" />
-                """),EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);

         assertFalse(ret.isEmpty());
         assertEquals("2022-08-24", ret.dateIso8601());
@@ -175,7 +173,7 @@ class PubDateSnifferTest {
                 <!doctype html>
                 <html>
                 <script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script><script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script>
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);

         assertFalse(ret.isEmpty());
         assertEquals("2004-08-24", ret.dateIso8601());
@@ -189,7 +187,7 @@ class PubDateSnifferTest {
                 <!doctype html>
                 <html>
                 <title>No date in the HTML</title>
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);

         assertFalse(ret.isEmpty());
         assertNull(ret.dateIso8601());
@@ -204,7 +202,7 @@ class PubDateSnifferTest {
                 <!doctype html>
                 <html>
                 <title>No date in the HTML</title>
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);

         assertFalse(ret.isEmpty());
         assertEquals("2022-02-03", ret.dateIso8601());
@@ -219,7 +217,7 @@ class PubDateSnifferTest {
                 <!doctype html>
                 <html>
                 <p>Published 2003, updated 2022</p>
-                """), EdgeHtmlStandard.HTML5, true);
+                """), HtmlStandard.HTML5, true);

         assertFalse(ret.isEmpty());
         assertNull(ret.dateIso8601());
@@ -245,7 +243,7 @@ class PubDateSnifferTest {
                 <!doctype html>
                 <html>
                 <div style="float: left;">&nbsp;<b>Post subject:</b> Keyboards.</div><div style="float: right;"><span class="postdetails"><b><img src="./styles/subsilver2/imageset/icon_post_target.gif" width="12" height="9" alt="Post" title="Post" /> <a href="./viewtopic.php?p=34580&amp;sid=cf0c13dedebb4fea1f03fa73e510cd9f#p34580">#1</a></b></span>&nbsp;<b>Posted:</b> Sun Oct 03, 2010 5:37 pm&nbsp;</div>
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);

         assertFalse(ret.isEmpty());
         assertNull(ret.dateIso8601());
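
The test hunks above double as usage documentation for the sniffer. Condensed, the call pattern looks like the sketch below; the PubDateSniffer constructor and the semantics of the trailing boolean are assumptions, since neither appears in this part of the diff:

    // Condensed from the tests above; constructor and flag semantics assumed.
    var dateSniffer = new PubDateSniffer();
    var ret = dateSniffer.getPubDate(
            "",                                       // raw HTTP headers, empty in the tests
            new EdgeUrl("https://www.example.com/"),  // consumed by the URL-pattern heuristics
            Jsoup.parse("<time datetime=\"2022-08-24\">time</time>"),
            HtmlStandard.UNKNOWN,                     // the renamed standard enum
            true);                                    // presumably enables the more expensive heuristics

    // The result exposes isEmpty(), year() and dateIso8601(), per the assertions above.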


@@ -13,7 +13,6 @@ java {
 }

 dependencies {
     implementation project(':code:libraries:language-processing')
-    implementation project(':code:libraries:misc')
     implementation project(':code:common:config')
     implementation project(':code:common:model')


@@ -3,7 +3,7 @@ package nu.marginalia.query_parser;

 import nu.marginalia.language.WordPatterns;
 import nu.marginalia.query_parser.token.Token;
 import nu.marginalia.query_parser.token.TokenType;
-import nu.marginalia.util.TransformList;
+import nu.marginalia.transform_list.TransformList;

 import java.util.List;


@@ -1,4 +1,4 @@
-package nu.marginalia.util;
+package nu.marginalia.transform_list;

 import java.util.List;
 import java.util.function.BiConsumer;
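
This diff reveals nothing about TransformList beyond its new package and its reliance on List and BiConsumer. As a purely hypothetical illustration of the kind of in-place list-rewriting helper those imports suggest (not the actual class), a minimal version might look like:

    package nu.marginalia.transform_list;

    import java.util.List;
    import java.util.function.BiConsumer;

    // Hypothetical sketch only; the real TransformList API is not visible in this diff.
    public class PairwiseTransformSketch<T> {
        private final List<T> list;

        public PairwiseTransformSketch(List<T> list) {
            this.list = list;
        }

        // Visit each adjacent pair, e.g. to merge or rewrite neighboring query tokens.
        public void forEachPair(BiConsumer<T, T> consumer) {
            for (int i = 0; i + 1 < list.size(); i++) {
                consumer.accept(list.get(i), list.get(i + 1));
            }
        }
    }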


@@ -1,4 +1,4 @@
-package nu.marginalia.util;
+package nu.marginalia.transform_list;

 import org.junit.jupiter.api.Test;


@@ -5,7 +5,7 @@ import com.google.inject.Singleton;
 import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.browse.model.BrowseResult;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.dbcommon.EdgeDomainBlacklist;
+import nu.marginalia.model.dbcommon.DomainBlacklist;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -23,7 +23,7 @@ public class DbBrowseDomainsRandom {
         this.dataSource = dataSource;
     }

-    public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist, int set) {
+    public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
         final String q = """
                 SELECT DOMAIN_ID, DOMAIN_NAME


@@ -5,7 +5,7 @@ import com.google.inject.Singleton;
 import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.browse.model.BrowseResult;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.dbcommon.EdgeDomainBlacklist;
+import nu.marginalia.model.dbcommon.DomainBlacklist;
 import nu.marginalia.model.id.EdgeId;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -24,7 +24,7 @@ public class DbBrowseDomainsSimilarCosine {
         this.dataSource = dataSource;
     }

-    public List<BrowseResult> getDomainNeighborsAdjacentCosine(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
+    public List<BrowseResult> getDomainNeighborsAdjacentCosine(EdgeId<EdgeDomain> domainId, DomainBlacklist blacklist, int count) {
         List<BrowseResult> domains = new ArrayList<>(count);

         String q = """


@@ -6,7 +6,7 @@ import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.browse.model.BrowseResult;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.dbcommon.EdgeDomainBlacklist;
+import nu.marginalia.model.dbcommon.DomainBlacklist;
 import nu.marginalia.model.id.EdgeId;
 import nu.marginalia.model.id.EdgeIdCollection;
 import org.slf4j.Logger;
@@ -26,7 +26,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
         this.dataSource = dataSource;
     }

-    public List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
+    public List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, DomainBlacklist blacklist, int count) {
         final Set<BrowseResult> domains = new HashSet<>(count*3);

         final String q = """
@@ -131,7 +131,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
         return new ArrayList<>(domains);
     }

-    public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist, int set) {
+    public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
         final String q = """
                 SELECT DOMAIN_ID, DOMAIN_NAME
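
All three browse DAOs now accept the renamed DomainBlacklist in their method signatures. A caller-side sketch follows; only the type's new name, nu.marginalia.model.dbcommon.DomainBlacklist, is confirmed by this diff, and the example arguments are arbitrary:

    // Sketch: the DAOs are presumably invoked roughly like this after the rename.
    List<BrowseResult> randomUnblacklisted(DbBrowseDomainsRandom random,
                                           DomainBlacklist blacklist) {
        // count=25 results from set 0; the blacklist filters the candidates
        return random.getRandomDomains(25, blacklist, 0);
    }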

Some files were not shown because too many files have changed in this diff.