Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 13:09:00 +00:00)

commit 6d939175b1 (parent 73e412ea5b)

    Additional code restructuring to get rid of util and misc-style packages.
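The change is mechanical throughout: the Edge-prefixed model types are renamed (EdgePageWordFlags → WordFlags, EdgePageDocumentFlags → DocumentFlags, EdgeUrlState → UrlIndexingState, EdgeDomainIndexingState → DomainIndexingState, EdgeHtmlStandard → HtmlStandard, EdgeDomainBlacklist → DomainBlacklist), the flag enums move into nu.marginalia.model.idx, and a few dead classes and the :code:libraries:misc dependency are dropped. The one idiom worth calling out is the flag/bitmask round-trip these enums implement; the following minimal sketch reconstructs it from the fragments in the diff. The asBit() body is an assumption (only its call sites appear below), and the toy enum omits some real constants such as UrlDomain and UrlPath.

import java.util.EnumSet;

// Sketch of the flag/bitmask round-trip used by WordFlags and DocumentFlags.
// Assumption: asBit() maps each constant to a distinct bit; 1 << ordinal() is
// a plausible implementation, but the diff only shows asBit() being called.
enum WordFlags {
    Title, Subjects, TfIdfHigh, NamesWords, Synthetic, Site, SiteAdjacent;

    public int asBit() {
        return 1 << ordinal(); // hypothetical body
    }

    // Verbatim shape of the decode() method shown in the diff below.
    public static EnumSet<WordFlags> decode(long encodedValue) {
        EnumSet<WordFlags> ret = EnumSet.noneOf(WordFlags.class);
        for (WordFlags f : values()) {
            if ((encodedValue & f.asBit()) > 0) {
                ret.add(f);
            }
        }
        return ret;
    }
}

class FlagRoundTripDemo {
    // Mirrors the private encodeFlags(Set<WordFlags>) helper in WordMetadata,
    // widened to long for the demo.
    static long encode(EnumSet<WordFlags> flags) {
        long ret = 0;
        for (var flag : flags) ret |= flag.asBit();
        return ret;
    }

    public static void main(String[] args) {
        var flags = EnumSet.of(WordFlags.Title, WordFlags.Site);
        long encoded = encode(flags);
        System.out.println(WordFlags.decode(encoded)); // prints [Title, Site]
    }
}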
@@ -1,8 +1,8 @@
 package nu.marginalia.index.client.model.results;

-import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.WordMetadata;
-import nu.marginalia.model.crawl.EdgePageDocumentFlags;
+import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;

 import java.util.Objects;
@@ -26,7 +26,7 @@ public final class SearchResultKeywordScore {
         this.hasPriorityTerms = hasPriorityTerms;
     }

-    private boolean hasTermFlag(EdgePageWordFlags flag) {
+    private boolean hasTermFlag(WordFlags flag) {
         return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
     }

@@ -37,7 +37,7 @@ public final class SearchResultKeywordScore {

         sum += DocumentMetadata.decodeTopology(encodedDocMetadata);

-        if (DocumentMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) {
+        if (DocumentMetadata.hasFlags(encodedDocMetadata, DocumentFlags.Simple.asBit())) {
             sum += 20;
         }

@@ -53,28 +53,28 @@ public final class SearchResultKeywordScore {
     public double termValue() {
         double sum = 0;

-        if (hasTermFlag(EdgePageWordFlags.Title)) {
+        if (hasTermFlag(WordFlags.Title)) {
             sum -= 15;
         }

-        if (hasTermFlag(EdgePageWordFlags.Site)) {
+        if (hasTermFlag(WordFlags.Site)) {
             sum -= 10;
-        } else if (hasTermFlag(EdgePageWordFlags.SiteAdjacent)) {
+        } else if (hasTermFlag(WordFlags.SiteAdjacent)) {
             sum -= 5;
         }

-        if (hasTermFlag(EdgePageWordFlags.Subjects)) {
+        if (hasTermFlag(WordFlags.Subjects)) {
             sum -= 10;
         }
-        if (hasTermFlag(EdgePageWordFlags.NamesWords)) {
+        if (hasTermFlag(WordFlags.NamesWords)) {
             sum -= 1;
         }

-        if (hasTermFlag(EdgePageWordFlags.UrlDomain)) {
+        if (hasTermFlag(WordFlags.UrlDomain)) {
             sum -= 5;
         }

-        if (hasTermFlag(EdgePageWordFlags.UrlPath)) {
+        if (hasTermFlag(WordFlags.UrlPath)) {
             sum -= 5;
         }

@@ -95,12 +95,12 @@ public final class SearchResultKeywordScore {
     }

     public boolean isKeywordSpecial() {
-        return keyword.contains(":") || hasTermFlag(EdgePageWordFlags.Synthetic);
+        return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic);
     }

     public boolean isKeywordRegular() {
         return !keyword.contains(":")
-                && !hasTermFlag(EdgePageWordFlags.Synthetic);
+                && !hasTermFlag(WordFlags.Synthetic);
     }

     public long encodedWordMetadata() {
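A worked example of termValue() above: a keyword flagged both Title and Site picks up -15 from the first branch and -10 from the second, so the sum is at least -25 before the remaining flags are considered; judging by these sign conventions, a more negative sum indicates a stronger term.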
@@ -14,7 +14,6 @@ java {
 dependencies {
     implementation project(':code:common:service-discovery')
     implementation project(':code:common:service-client')
-    implementation project(':code:libraries:misc')
 }

 test {
@@ -14,6 +14,7 @@ dependencies {
     implementation project(':code:common:service-discovery')
     implementation project(':code:common:service-client')
     implementation project(':code:libraries:big-string')
+    implementation project(':code:libraries:braille-block-punch-cards')

     implementation libs.lombok
     annotationProcessor libs.lombok
@@ -1,6 +1,6 @@
 package nu.marginalia.model.crawl;

-public enum EdgeDomainIndexingState {
+public enum DomainIndexingState {
     ACTIVE("Active"),
     EXHAUSTED("Fully Crawled"),
     SPECIAL("Content is side-loaded"),
@@ -12,7 +12,7 @@ public enum EdgeDomainIndexingState {

     public String desc;

-    EdgeDomainIndexingState(String desc) {
+    DomainIndexingState(String desc) {
         this.desc = desc;
     }
 }
@@ -1,15 +0,0 @@
-package nu.marginalia.model.crawl;
-
-
-import lombok.*;
-
-@AllArgsConstructor
-@EqualsAndHashCode
-@Getter
-@Setter
-@Builder
-@ToString
-public class EdgeContentType {
-    public final String contentType;
-    public final String charset;
-}
@@ -1,7 +1,7 @@
 package nu.marginalia.model.crawl;

 /** This should correspond to EC_URL.STATE */
-public enum EdgeUrlState {
+public enum UrlIndexingState {
     OK,
     REDIRECT,
     DEAD,
@@ -5,8 +5,8 @@ import gnu.trove.set.hash.TIntHashSet;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.id.EdgeId;

-@ImplementedBy(EdgeDomainBlacklistImpl.class)
-public interface EdgeDomainBlacklist {
+@ImplementedBy(DomainBlacklistImpl.class)
+public interface DomainBlacklist {
     boolean isBlacklisted(int domainId);
     default boolean isBlacklisted(EdgeId<EdgeDomain> domainId) {
         return isBlacklisted(domainId.id());
@@ -6,20 +6,19 @@ import com.zaxxer.hikari.HikariDataSource;
 import gnu.trove.set.hash.TIntHashSet;
 import io.reactivex.rxjava3.schedulers.Schedulers;
 import lombok.SneakyThrows;
-import nu.marginalia.model.EdgeDomain;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.util.concurrent.TimeUnit;

 @Singleton
-public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
+public class DomainBlacklistImpl implements DomainBlacklist {
     private volatile TIntHashSet spamDomainSet = new TIntHashSet();
     private final HikariDataSource dataSource;
     private final Logger logger = LoggerFactory.getLogger(getClass());

     @Inject
-    public EdgeDomainBlacklistImpl(HikariDataSource dataSource) {
+    public DomainBlacklistImpl(HikariDataSource dataSource) {
         this.dataSource = dataSource;

         Schedulers.io().schedulePeriodicallyDirect(this::updateSpamList, 5, 600, TimeUnit.SECONDS);
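Note on the constructor above: Schedulers.io().schedulePeriodicallyDirect(this::updateSpamList, 5, 600, TimeUnit.SECONDS) refreshes the spam-domain set on RxJava's io scheduler, first 5 seconds after construction and then every 600 seconds (ten minutes) thereafter.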
@@ -1,8 +1,8 @@
-package nu.marginalia.model.crawl;
+package nu.marginalia.model.idx;

 import java.util.EnumSet;

-public enum EdgePageDocumentFlags {
+public enum DocumentFlags {
     /** Simple processing was done, this document should be de-prioritized as a search result */
     Simple,

@@ -23,10 +23,10 @@ public enum EdgePageDocumentFlags {
         return (asBit() & value) > 0;
     }

-    public static EnumSet<EdgePageDocumentFlags> decode(long encodedValue) {
-        EnumSet<EdgePageDocumentFlags> ret = EnumSet.noneOf(EdgePageDocumentFlags.class);
+    public static EnumSet<DocumentFlags> decode(long encodedValue) {
+        EnumSet<DocumentFlags> ret = EnumSet.noneOf(DocumentFlags.class);

-        for (EdgePageDocumentFlags f : values()) {
+        for (DocumentFlags f : values()) {
             if ((encodedValue & f.asBit()) > 0) {
                 ret.add(f);
             }
@@ -1,6 +1,5 @@
 package nu.marginalia.model.idx;

-import nu.marginalia.model.crawl.EdgePageDocumentFlags;
 import nu.marginalia.model.crawl.PubDate;

 import java.util.EnumSet;
@@ -44,7 +43,7 @@ public record DocumentMetadata(int rank,
     public DocumentMetadata() {
         this(defaultValue());
     }
-    public DocumentMetadata(int topology, int year, int sets, int quality, EnumSet<EdgePageDocumentFlags> flags) {
+    public DocumentMetadata(int topology, int year, int sets, int quality, EnumSet<DocumentFlags> flags) {
         this(0, 0, topology, year, sets, quality, encodeFlags(flags));
     }

@@ -58,13 +57,13 @@ public record DocumentMetadata(int rank,
         return new DocumentMetadata(rank, encSize, topology, year, sets, quality, flags);
     }

-    private static byte encodeFlags(Set<EdgePageDocumentFlags> flags) {
+    private static byte encodeFlags(Set<DocumentFlags> flags) {
         byte ret = 0;
         for (var flag : flags) { ret |= flag.asBit(); }
         return ret;
     }

-    public boolean hasFlag(EdgePageDocumentFlags flag) {
+    public boolean hasFlag(DocumentFlags flag) {
         return (flags & flag.asBit()) != 0;
     }

@@ -1,23 +1,20 @@
-package nu.marginalia.model.crawl;
+package nu.marginalia.model.idx;


 import java.util.EnumSet;

-public enum EdgePageWordFlags {
+public enum WordFlags {

     /** Word appears in title */
     Title,

-    /** Word appears to be the subject in several sentences
-     * @see SubjectCounter */
+    /** Word appears to be the subject in several sentences */
     Subjects,

-    /** Word has high tf-idf
-     * @see KeywordCounter */
+    /** Word has high tf-idf */
     TfIdfHigh,

-    /** Word is a likely named object. This is a weaker version of Subjects.
-     * @see NameCounter */
+    /** Word is a likely named object. This is a weaker version of Subjects. */
     NamesWords,

     /** The word isn't actually a word on page, but a fake keyword from the code
@@ -26,12 +23,10 @@ public enum EdgePageWordFlags {
     Synthetic,

     /** Word is important to site
-     * @see SiteWords
      */
     Site,

     /** Word is important to adjacent documents
-     * @see SiteWords
      * */
     SiteAdjacent,

@@ -54,10 +49,10 @@ public enum EdgePageWordFlags {
         return (asBit() & value) > 0;
     }

-    public static EnumSet<EdgePageWordFlags> decode(long encodedValue) {
-        EnumSet<EdgePageWordFlags> ret = EnumSet.noneOf(EdgePageWordFlags.class);
+    public static EnumSet<WordFlags> decode(long encodedValue) {
+        EnumSet<WordFlags> ret = EnumSet.noneOf(WordFlags.class);

-        for (EdgePageWordFlags f : values()) {
+        for (WordFlags f : values()) {
             if ((encodedValue & f.asBit()) > 0) {
                 ret.add(f);
             }
@@ -1,7 +1,7 @@
 package nu.marginalia.model.idx;

-import nu.marginalia.model.crawl.EdgePageWordFlags;
-import nu.marginalia.util.BrailleBlockPunchCards;
+import nu.marginalia.bbpc.BrailleBlockPunchCards;

 import java.util.EnumSet;
 import java.util.Set;
@@ -39,12 +39,12 @@ public record WordMetadata(int tfIdf,

     public WordMetadata(int tfIdf,
                         int positions,
-                        Set<EdgePageWordFlags> flags)
+                        Set<WordFlags> flags)
     {
         this(tfIdf, positions, encodeFlags(flags));
     }

-    private static byte encodeFlags(Set<EdgePageWordFlags> flags) {
+    private static byte encodeFlags(Set<WordFlags> flags) {
         byte ret = 0;
         for (var flag : flags) { ret |= flag.asBit(); }
         return ret;
@@ -64,7 +64,7 @@ public record WordMetadata(int tfIdf,
         return (meta >>> TF_IDF_SHIFT) & TF_IDF_MASK;
     }

-    public boolean hasFlag(EdgePageWordFlags flag) {
+    public boolean hasFlag(WordFlags flag) {
         return (flags & flag.asBit()) != 0;
     }

@@ -98,7 +98,7 @@ public record WordMetadata(int tfIdf,
     }


-    public EnumSet<EdgePageWordFlags> flagSet() {
-        return EdgePageWordFlags.decode(flags);
+    public EnumSet<WordFlags> flagSet() {
+        return WordFlags.decode(flags);
     }
 }
@@ -11,7 +11,6 @@ import java.util.regex.Pattern;

 public class QueryParams {

-    private static final Pattern paramSplitterPattern = Pattern.compile("&");

     @Nullable
     public static String queryParamsSanitizer(String path, @Nullable String queryParams) {
@@ -1,7 +1,7 @@
-package nu.marginalia.index.model;
+package nu.marginalia.model;


-import nu.marginalia.model.crawl.EdgePageDocumentFlags;
+import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import org.junit.jupiter.api.Test;

@@ -67,7 +67,7 @@ class DocumentMetadataTest {

     @Test
     public void encRank() {
-        var meta = new DocumentMetadata(5, 22, 3, 8, EnumSet.noneOf(EdgePageDocumentFlags.class))
+        var meta = new DocumentMetadata(5, 22, 3, 8, EnumSet.noneOf(DocumentFlags.class))
                 .withSize(0xffffffff).encode();
         var enc2 = DocumentMetadata.encodeRank(meta, 83);

@@ -1,6 +1,6 @@
 package nu.marginalia.model;

-import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.WordMetadata;
 import org.junit.jupiter.api.Test;

@@ -12,16 +12,16 @@ class WordMetadataTest {

     @Test
     public void codecTest() {
-        verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, EnumSet.noneOf(EdgePageWordFlags.class)));
-        System.out.println(new WordMetadata(32, 0x7f0f0005, EnumSet.allOf(EdgePageWordFlags.class)));
-        System.out.println(new WordMetadata(32, 0xff0f0013, EnumSet.noneOf(EdgePageWordFlags.class)));
+        verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, EnumSet.allOf(WordFlags.class)));
+        verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, EnumSet.allOf(WordFlags.class)));
+        verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, EnumSet.noneOf(WordFlags.class)));
+        System.out.println(new WordMetadata(32, 0x7f0f0005, EnumSet.allOf(WordFlags.class)));
+        System.out.println(new WordMetadata(32, 0xff0f0013, EnumSet.noneOf(WordFlags.class)));
     }

     @Test
     public void testClampTfIdfLow() {
-        var original = new WordMetadata(0x8000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new WordMetadata(0x8000FFFF, 0, EnumSet.noneOf(WordFlags.class));
         var encoded = new WordMetadata(original.encode());

         assertEquals(original.positions(), encoded.positions());
@@ -30,7 +30,7 @@ class WordMetadataTest {

     @Test
     public void testClampTfIdfHigh() {
-        var original = new WordMetadata(0x7000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new WordMetadata(0x7000FFFF, 0, EnumSet.noneOf(WordFlags.class));
         var encoded = new WordMetadata(original.encode());

         assertEquals(original.positions(), encoded.positions());
@@ -12,7 +12,6 @@ java {
 dependencies {
     implementation project(':code:common:service-client')
     implementation project(':code:common:service-discovery')
-    implementation project(':code:libraries:misc')

     implementation libs.lombok
     annotationProcessor libs.lombok
@@ -1,7 +1,7 @@
-package nu.marginalia.model.crawl;
+package nu.marginalia.crawling.common.model;


-public enum EdgeHtmlStandard {
+public enum HtmlStandard {
     PLAIN(0, 1, 1993),
     UNKNOWN(0, 1, 2000),
     HTML123(0, 1, 1997),
@@ -18,7 +18,7 @@ public enum EdgeHtmlStandard {
      * */
     public final int yearGuess;

-    EdgeHtmlStandard(double offset, double scale, int yearGuess) {
+    HtmlStandard(double offset, double scale, int yearGuess) {
         this.offset = offset;
         this.scale = scale;
         this.yearGuess = yearGuess;
@@ -18,6 +18,8 @@ dependencies {
     implementation project(':code:common:service-client')
     implementation project(':code:libraries:language-processing')

+    implementation project(':code:crawl:common')
+
     implementation libs.lombok
     annotationProcessor libs.lombok
     implementation libs.bundles.slf4j
@@ -2,7 +2,7 @@ package nu.marginalia.converting.instruction;

 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.converting.model.DocumentKeywords;
 import nu.marginalia.converting.instruction.instructions.DomainLink;
@@ -15,7 +15,7 @@ public interface Interpreter {
     void loadRssFeed(EdgeUrl[] rssFeed);
     void loadDomainLink(DomainLink[] links);

-    void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip);
+    void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip);
     void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument);
     void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError);

@@ -1,7 +1,7 @@
 package nu.marginalia.converting.instruction.instructions;

-import nu.marginalia.model.crawl.EdgeHtmlStandard;
-import nu.marginalia.model.crawl.EdgeUrlState;
+import nu.marginalia.crawling.common.model.HtmlStandard;
+import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.converting.instruction.Instruction;
 import nu.marginalia.converting.instruction.InstructionTag;
 import nu.marginalia.converting.instruction.Interpreter;
@@ -10,11 +10,11 @@ import org.jetbrains.annotations.Nullable;


 public record LoadProcessedDocument(EdgeUrl url,
-                                    EdgeUrlState state,
+                                    UrlIndexingState state,
                                     String title,
                                     String description,
                                     int htmlFeatures,
-                                    EdgeHtmlStandard standard,
+                                    HtmlStandard standard,
                                     int length,
                                     long hash,
                                     double quality,
@@ -1,6 +1,6 @@
 package nu.marginalia.converting.instruction.instructions;

-import nu.marginalia.model.crawl.EdgeUrlState;
+import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.converting.instruction.Instruction;
 import nu.marginalia.converting.instruction.InstructionTag;
 import nu.marginalia.converting.instruction.Interpreter;
@@ -8,7 +8,7 @@ import nu.marginalia.model.EdgeUrl;


 public record LoadProcessedDocumentWithError(EdgeUrl url,
-                                             EdgeUrlState state,
+                                             UrlIndexingState state,
                                              String reason) implements Instruction
 {
     @Override
@@ -1,12 +1,12 @@
 package nu.marginalia.converting.instruction.instructions;

-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.converting.instruction.Instruction;
 import nu.marginalia.converting.instruction.InstructionTag;
 import nu.marginalia.converting.instruction.Interpreter;
 import nu.marginalia.model.EdgeDomain;

-public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) implements Instruction {
+public record LoadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) implements Instruction {

     @Override
     public void apply(Interpreter interpreter) {
@@ -3,7 +3,7 @@ package nu.marginalia.converting.model;
 import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
 import lombok.Getter;
 import lombok.ToString;
-import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordFlags;

 import java.util.*;

@@ -54,14 +54,14 @@ public class DocumentKeywordsBuilder {
         words.putIfAbsent(word, 0);
     }

-    public void setFlagOnMetadataForWords(EdgePageWordFlags flag, Set<String> flagWords) {
+    public void setFlagOnMetadataForWords(WordFlags flag, Set<String> flagWords) {
         flagWords.forEach(word ->
                 words.mergeLong(word, flag.asBit(), (a, b) -> a|b)
         );
     }

     public void addAllSyntheticTerms(Collection<String> newWords) {
-        long meta = EdgePageWordFlags.Synthetic.asBit();
+        long meta = WordFlags.Synthetic.asBit();

         newWords.forEach(word -> {
             words.putIfAbsent(word, meta);
@@ -20,20 +20,27 @@ tasks.distZip.enabled = false

 dependencies {
     implementation project(':third-party')
+    implementation project(':code:api:index-api')

     implementation project(':code:common:model')
     implementation project(':code:common:service')
     implementation project(':code:common:config')
+    implementation project(':code:common:service-discovery')
+    implementation project(':code:common:service-client')

     implementation project(':code:libraries:guarded-regex')
     implementation project(':code:libraries:easy-lsh')
     implementation project(':code:libraries:big-string')
-    implementation project(':code:api:index-api')
-    implementation project(':code:common:service-discovery')
-    implementation project(':code:common:service-client')
     implementation project(':code:libraries:language-processing')

     implementation project(':code:crawl:common')
     implementation project(':code:crawl:converting-model')
     implementation project(':code:crawl:crawling-model')

+    implementation project(':code:features:adblock')
+    implementation project(':code:features:pubdate')
+    implementation project(':code:features:topic-detection')

     implementation libs.lombok
     annotationProcessor libs.lombok
     implementation libs.bundles.slf4j
@@ -1,7 +1,7 @@
 package nu.marginalia.converting;

 import com.github.luben.zstd.ZstdOutputStream;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.converting.instruction.Interpreter;
 import nu.marginalia.converting.model.DocumentKeywords;
@@ -49,7 +49,7 @@ public class ConversionLog implements AutoCloseable, Interpreter {
     public void loadDomainLink(DomainLink[] links) {}

     @Override
-    public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {}
+    public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) {}

     @Override
     public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {}
@@ -2,7 +2,7 @@ package nu.marginalia.converting;

 import com.github.luben.zstd.ZstdOutputStream;
 import com.google.gson.Gson;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.converting.instruction.Instruction;
 import nu.marginalia.converting.instruction.Interpreter;
@@ -106,7 +106,7 @@ public class InstructionWriter {
     public void loadDomainLink(DomainLink[] links) {}

     @Override
-    public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
+    public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) {
         this.domainName = domain.toString();
     }

@@ -1,8 +1,8 @@
 package nu.marginalia.converting.model;

 import lombok.ToString;
-import nu.marginalia.model.crawl.EdgePageDocumentFlags;
-import nu.marginalia.model.crawl.EdgeUrlState;
+import nu.marginalia.model.idx.DocumentFlags;
+import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.model.EdgeUrl;

 import java.util.OptionalDouble;
@@ -14,11 +14,11 @@ public class ProcessedDocument {
     public ProcessedDocumentDetails details;
     public DocumentKeywordsBuilder words;

-    public EdgeUrlState state;
+    public UrlIndexingState state;
     public String stateReason;

     public boolean isOk() {
-        return EdgeUrlState.OK == state;
+        return UrlIndexingState.OK == state;
     }

     public boolean isProcessedFully() {
@@ -28,7 +28,7 @@ public class ProcessedDocument {
         if (details == null)
             return false;

-        return !details.metadata.hasFlag(EdgePageDocumentFlags.Simple);
+        return !details.metadata.hasFlag(DocumentFlags.Simple);
     }

     public OptionalDouble quality() {
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.model;

 import lombok.ToString;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.EdgeUrl;
@@ -23,7 +23,7 @@ public class ProcessedDocumentDetails {
     public long hashCode;

     public Set<HtmlFeature> features;
-    public EdgeHtmlStandard standard;
+    public HtmlStandard standard;

     public List<EdgeUrl> linksInternal;
     public List<EdgeUrl> linksExternal;
@@ -2,7 +2,7 @@ package nu.marginalia.converting.model;

 import lombok.ToString;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;

 import java.util.List;
 import java.util.Optional;
@@ -13,7 +13,7 @@ public class ProcessedDomain {
     public EdgeDomain domain;

     public List<ProcessedDocument> documents;
-    public EdgeDomainIndexingState state;
+    public DomainIndexingState state;
     public EdgeDomain redirect;
     public String ip;

@@ -4,7 +4,7 @@ import com.google.inject.Inject;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;
-import nu.marginalia.model.crawl.EdgeUrlState;
+import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin;
@@ -45,12 +45,12 @@ public class DocumentProcessor {
             processDocument(crawledDocument, crawledDomain, ret);
         }
         catch (DisqualifiedException ex) {
-            ret.state = EdgeUrlState.DISQUALIFIED;
+            ret.state = UrlIndexingState.DISQUALIFIED;
             ret.stateReason = ex.reason.toString();
             logger.debug("Disqualified {}: {}", ret.url, ex.reason);
         }
         catch (Exception ex) {
-            ret.state = EdgeUrlState.DISQUALIFIED;
+            ret.state = UrlIndexingState.DISQUALIFIED;
             ret.stateReason = DisqualifiedException.DisqualificationReason.PROCESSING_EXCEPTION.toString();
             logger.info("Failed to convert " + crawledDocument.url, ex);
             ex.printStackTrace();
@@ -125,11 +125,11 @@ public class DocumentProcessor {
         return false;
     }

-    private EdgeUrlState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) {
+    private UrlIndexingState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) {
         return switch (CrawlerDocumentStatus.valueOf(crawlerStatus)) {
-            case OK -> httpStatus < 300 ? EdgeUrlState.OK : EdgeUrlState.DEAD;
-            case REDIRECT -> EdgeUrlState.REDIRECT;
-            default -> EdgeUrlState.DEAD;
+            case OK -> httpStatus < 300 ? UrlIndexingState.OK : UrlIndexingState.DEAD;
+            case REDIRECT -> UrlIndexingState.REDIRECT;
+            default -> UrlIndexingState.DEAD;
         };
     }

@@ -6,11 +6,10 @@ import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;
 import nu.marginalia.crawling.model.CrawlerDomainStatus;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.util.StringPool;
 import nu.marginalia.converting.processor.logic.links.InternalLinkGraph;
 import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;

@@ -134,12 +133,12 @@ public class DomainProcessor {
         }
     }

-    private EdgeDomainIndexingState getState(String crawlerStatus) {
+    private DomainIndexingState getState(String crawlerStatus) {
         return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) {
-            case OK -> EdgeDomainIndexingState.ACTIVE;
-            case REDIRECT -> EdgeDomainIndexingState.REDIR;
-            case BLOCKED -> EdgeDomainIndexingState.BLOCKED;
-            default -> EdgeDomainIndexingState.ERROR;
+            case OK -> DomainIndexingState.ACTIVE;
+            case REDIRECT -> DomainIndexingState.REDIR;
+            case BLOCKED -> DomainIndexingState.BLOCKED;
+            default -> DomainIndexingState.ERROR;
         };
     }

@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor;

 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
-import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.model.EdgeUrl;
@@ -24,7 +24,7 @@ public class SiteWords {
         Map<EdgeUrl, Set<String>> linkedKeywords = getAdjacentWords(internalLinkGraph);

         for (var doc : processedDomain.documents) {
-            applyKeywordsToDoc(doc, EdgePageWordFlags.SiteAdjacent, linkedKeywords.get(doc.url));
+            applyKeywordsToDoc(doc, WordFlags.SiteAdjacent, linkedKeywords.get(doc.url));
         }

     }
@@ -33,17 +33,17 @@ public class SiteWords {
         Set<String> commonSiteWords = new HashSet<>(10);

         commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain,
-                EdgePageWordFlags.Subjects));
+                WordFlags.Subjects));

         commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain,
-                EdgePageWordFlags.Title));
+                WordFlags.Title));

         if (commonSiteWords.isEmpty()) {
             return;
         }

         for (var doc : processedDomain.documents) {
-            applyKeywordsToDoc(doc, EdgePageWordFlags.Site, commonSiteWords);
+            applyKeywordsToDoc(doc, WordFlags.Site, commonSiteWords);
         }
     }

@@ -74,7 +74,7 @@ public class SiteWords {
         return linkedKeywords;
     }

-    private void applyKeywordsToDoc(ProcessedDocument doc, EdgePageWordFlags flag, Set<String> words) {
+    private void applyKeywordsToDoc(ProcessedDocument doc, WordFlags flag, Set<String> words) {
         if (doc.words != null && words != null) {
             doc.words.setFlagOnMetadataForWords(flag, words);
         }
@@ -7,7 +7,7 @@ import nu.marginalia.language.keywords.KeywordExtractor;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.KeywordMetadata;
 import nu.marginalia.language.model.WordRep;
-import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordFlags;

 import java.util.EnumSet;

@@ -22,7 +22,7 @@ public class SimpleKeywords {
                               KeywordMetadata metadata,
                               DocumentLanguageData documentLanguageData) {

-        EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);
+        EnumSet<WordFlags> flagsTemplate = EnumSet.noneOf(WordFlags.class);

         for (var sent : documentLanguageData.sentences) {

@@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.logic;

 import crawlercommons.utils.Strings;
 import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.converting.model.DisqualifiedException;
 import org.jsoup.nodes.Document;
@@ -22,7 +22,7 @@ public class DocumentValuator {

     );

-    public double getQuality(CrawledDocument crawledDocument, EdgeHtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException {
+    public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException {
         double smutCoefficient = dld.streamLowerCase().filter(filthTable::contains).count();
         double scriptPenalty = getScriptPenalty(parsedDocument);

@@ -2,10 +2,14 @@ package nu.marginalia.converting.processor.logic;

 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import nu.marginalia.adblock.AdblockSimulator;
+import nu.marginalia.adblock.GoogleAnwersSpamDetector;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.converting.processor.logic.topic.*;
+import nu.marginalia.topic.RecipeDetector;
+import nu.marginalia.topic.TextileCraftDetector;
+import nu.marginalia.topic.WoodworkingDetector;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.logic;

 import com.google.common.base.Strings;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.DocumentType;
 import org.slf4j.Logger;
@@ -12,53 +12,53 @@ public class HtmlStandardExtractor {

     private static final Logger logger = LoggerFactory.getLogger(HtmlStandardExtractor.class);

-    public static EdgeHtmlStandard parseDocType(DocumentType docType) {
+    public static HtmlStandard parseDocType(DocumentType docType) {
         if (null == docType) {
-            return EdgeHtmlStandard.UNKNOWN;
+            return HtmlStandard.UNKNOWN;
         }
         String publicId = docType.publicId();
         if (Strings.isNullOrEmpty(publicId))
-            return EdgeHtmlStandard.HTML5;
+            return HtmlStandard.HTML5;

         publicId = publicId.toUpperCase();
         if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 4")) {
-            return EdgeHtmlStandard.HTML4;
+            return HtmlStandard.HTML4;
         }
         if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 3")) {
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         }
         if (publicId.startsWith("-//INTERNET/RFC XXXX//EN"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//NETSCAPE COMM. CORP"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//SQ//DTD HTML 2"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//SOFTQUAD//DTD HTML 2"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//W3O//DTD W3 HTML 2"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML 2"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML//EN"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-/W3C//DTD HTML 3"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-/W3C/DTD HTML 3"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML 3"))
-            return EdgeHtmlStandard.HTML123;
+            return HtmlStandard.HTML123;
         if (publicId.startsWith("-//W3C//DTD XHTML"))
-            return EdgeHtmlStandard.XHTML;
+            return HtmlStandard.XHTML;
         if (publicId.startsWith("ISO/IEC 15445:2000//DTD"))
-            return EdgeHtmlStandard.XHTML;
+            return HtmlStandard.XHTML;
         if (publicId.startsWith("-//W3C//DTD HTML"))
-            return EdgeHtmlStandard.HTML4;
+            return HtmlStandard.HTML4;

         logger.debug("Unknown publicID standard {}", publicId);
-        return EdgeHtmlStandard.UNKNOWN;
+        return HtmlStandard.UNKNOWN;
     }

-    public static EdgeHtmlStandard sniffHtmlStandard(Document parsed) {
+    public static HtmlStandard sniffHtmlStandard(Document parsed) {
         int html4Attributes = 0;
         int html5Attributes = 0;

@@ -72,11 +72,11 @@ public class HtmlStandardExtractor {
             html4Attributes++;
         }
         if (html5Attributes > 0) {
-            return EdgeHtmlStandard.HTML5;
+            return HtmlStandard.HTML5;
         }
         if (html4Attributes > 0) {
-            return EdgeHtmlStandard.HTML4;
+            return HtmlStandard.HTML4;
         }
-        return EdgeHtmlStandard.HTML123;
+        return HtmlStandard.HTML123;
     }
 }
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.logic;

 import com.google.inject.Singleton;
-import nu.marginalia.model.crawl.EdgeUrlState;
+import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.lsh.EasyLSH;
 import org.slf4j.Logger;
@@ -52,7 +52,7 @@ public class LshDocumentDeduplicator {
         {
             logger.debug("{} duplicates {}", otherDoc.url, thisDoc.url);

-            otherDoc.state = EdgeUrlState.DISQUALIFIED;
+            otherDoc.state = UrlIndexingState.DISQUALIFIED;
             otherDoc.stateReason = "Duplicate";

             return true;
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.logic.links;

 import ca.rmen.porterstemmer.PorterStemmer;
-import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.converting.model.ProcessedDomain;

 import java.util.*;
@@ -16,7 +16,7 @@ public class CommonKeywordExtractor {

     private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;

-    public List<String> getCommonSiteWords(ProcessedDomain ret, EdgePageWordFlags... flags) {
+    public List<String> getCommonSiteWords(ProcessedDomain ret, WordFlags... flags) {

         if (ret.documents.size() < MIN_REQUIRED_DOCUMENTS)
             return Collections.emptyList();
@@ -27,7 +27,7 @@ public class CommonKeywordExtractor {
         final Map<String, Set<String>> stemmedToNonstemmedVariants = new HashMap<>(ret.documents.size()*10);

         int qualifiedDocCount = 0;
-        long wordFlags = Arrays.stream(flags).mapToInt(EdgePageWordFlags::asBit).reduce(0, (a,b) -> a|b);
+        long wordFlags = Arrays.stream(flags).mapToInt(WordFlags::asBit).reduce(0, (a, b) -> a|b);
         for (var doc : ret.documents) {
             if (doc.words == null)
                 continue;
@@ -1,6 +1,6 @@
 package nu.marginalia.converting.processor.logic.links;
 
-import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.model.EdgeUrl;
 
@@ -22,7 +22,7 @@ public class InternalLinkGraph {
         internalLinkGraph.put(doc.url, new HashSet<>(doc.details.linksInternal));
         knownUrls.addAll(doc.details.linksInternal);
 
-        List<String> topKeywords = doc.words.getWordsWithAnyFlag(EdgePageWordFlags.TfIdfHigh.asBit() | EdgePageWordFlags.Subjects.asBit());
+        List<String> topKeywords = doc.words.getWordsWithAnyFlag(WordFlags.TfIdfHigh.asBit() | WordFlags.Subjects.asBit());
 
         topKeywordsByUrl.put(doc.url, new HashSet<>(topKeywords));
         candidateKeywordsByUrl.put(doc.url, new HashSet<>(topKeywords));
@@ -1,6 +0,0 @@
-package nu.marginalia.converting.processor.logic.pubdate;
-
-public enum PubDateEffortLevel {
-    LOW,
-    HIGH
-}
@@ -1,23 +0,0 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
-
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
-import nu.marginalia.model.EdgeUrl;
-import org.jsoup.nodes.Document;
-
-import java.util.Optional;
-
-public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {
-
-    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
-        if (htmlStandard == EdgeHtmlStandard.UNKNOWN)
-            return Optional.empty();
-
-        return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard)));
-    }
-
-}
@@ -4,7 +4,7 @@ import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.language.LanguageFilter;
 import nu.marginalia.language.model.DocumentLanguageData;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.converting.model.DocumentKeywordsBuilder;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.converting.model.DisqualifiedException;
@@ -56,7 +56,7 @@ public abstract class AbstractDocumentProcessorPlugin {
         return this;
     }
 
-    public MetaTagsBuilder addFormat(EdgeHtmlStandard standard) {
+    public MetaTagsBuilder addFormat(HtmlStandard standard) {
         tagWords.add("format:"+standard.toString().toLowerCase());
         return this;
     }
@@ -7,23 +7,22 @@ import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
 import nu.marginalia.crawling.common.link.LinkParser;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.language.model.KeywordMetadata;
 import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
-import nu.marginalia.model.crawl.EdgePageDocumentFlags;
+import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.converting.model.DocumentKeywordsBuilder;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.converting.processor.logic.*;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateSniffer;
 import nu.marginalia.gregex.GuardedRegex;
 import nu.marginalia.gregex.GuardedRegexFactory;
 import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.ProcessedDocumentDetails;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.pubdate.PubDateSniffer;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 
@@ -120,7 +119,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         ret.hashCode = dld.localitySensitiveHashCode();
 
         PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
-        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class));
+        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(DocumentFlags.class));
 
         DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
 
@@ -262,10 +261,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         words.addAllSyntheticTerms(linkTerms);
     }
 
-    private EdgeHtmlStandard getHtmlStandard(Document doc) {
+    private HtmlStandard getHtmlStandard(Document doc) {
-        EdgeHtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
+        HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
 
-        if (EdgeHtmlStandard.UNKNOWN.equals(htmlStandard)) {
+        if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
             return HtmlStandardExtractor.sniffHtmlStandard(doc);
         }
         return htmlStandard;
@@ -4,11 +4,10 @@ import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.language.model.KeywordMetadata;
 import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
-import nu.marginalia.model.crawl.EdgePageDocumentFlags;
+import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.converting.model.DocumentKeywordsBuilder;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.crawl.PubDate;
@@ -16,7 +15,7 @@ import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.ProcessedDocumentDetails;
 import nu.marginalia.converting.processor.logic.PlainTextLogic;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.util.LineUtils;
+import nu.marginalia.converting.util.LineUtils;
 import org.apache.commons.lang3.StringUtils;
 
 import java.net.URISyntaxException;
@@ -78,7 +77,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
         List<String> firstFewLines = LineUtils.firstNLines(documentBody, 40);
 
         ret.length = documentBody.length();
-        ret.standard = EdgeHtmlStandard.PLAIN;
+        ret.standard = HtmlStandard.PLAIN;
         ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);
 
         ret.quality = -1;
@@ -89,7 +88,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
 
         final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));
 
-        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText));
+        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(DocumentFlags.PlainText));
 
         DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
 
@@ -1,4 +1,4 @@
-package nu.marginalia.util;
+package nu.marginalia.converting.util;
 
 import java.util.ArrayList;
 import java.util.List;
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.logic;
 
 import nu.marginalia.converting.processor.logic.PlainTextLogic;
-import nu.marginalia.util.LineUtils;
+import nu.marginalia.converting.util.LineUtils;
 import nu.marginalia.model.EdgeUrl;
 import org.junit.jupiter.api.Test;
 
@@ -3,7 +3,6 @@ package nu.marginalia.converting.processor.keywords;
 import lombok.SneakyThrows;
 import nu.marginalia.LanguageModels;
 import nu.marginalia.language.WordPatterns;
-import nu.marginalia.language.model.KeywordMetadata;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.language.model.WordSpan;
 import nu.marginalia.language.sentence.SentenceExtractor;
@@ -12,10 +11,7 @@ import nu.marginalia.language.keywords.KeywordExtractor;
 import nu.marginalia.language.model.WordSeparator;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.crawl.EdgePageWordFlags;
-import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.test.util.TestLanguageModels;
-import org.apache.commons.lang3.tuple.Pair;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Disabled;
@@ -27,7 +23,6 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;
 import java.util.regex.Pattern;
-import java.util.stream.IntStream;
 
 @Tag("slow")
 class SentenceExtractorTest {
@@ -1,4 +1,4 @@
-package nu.marginalia.util;
+package nu.marginalia.converting.util;
 
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
@@ -5,9 +5,8 @@ import com.google.common.hash.Hashing;
 import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.crawling.model.CrawlingSpecification;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl;
+import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
 
-import java.sql.Connection;
 import java.sql.ResultSet;
 import java.sql.SQLException;
 import java.util.ArrayList;
@@ -69,11 +68,11 @@ public class CrawlJobDomainExtractor {
             """;
 
 
-    private final EdgeDomainBlacklistImpl blacklist;
+    private final DomainBlacklistImpl blacklist;
     private final HikariDataSource dataSource;
     private static final HashFunction hasher = Hashing.murmur3_128(0);
 
-    public CrawlJobDomainExtractor(EdgeDomainBlacklistImpl blacklist, HikariDataSource dataSource) {
+    public CrawlJobDomainExtractor(DomainBlacklistImpl blacklist, HikariDataSource dataSource) {
         this.blacklist = blacklist;
         this.dataSource = dataSource;
     }
@@ -2,7 +2,7 @@ package nu.marginalia.crawl;
 
 import nu.marginalia.crawling.model.CrawlingSpecification;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl;
+import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
 import nu.marginalia.service.module.DatabaseModule;
 
 import java.io.IOException;
@@ -37,7 +37,7 @@ public class CrawlJobExtractorMain {
 
     private static Stream<CrawlingSpecification> streamSpecs(String[] targetDomains) {
         var ds = new DatabaseModule().provideConnection();
-        var domainExtractor = new CrawlJobDomainExtractor(new EdgeDomainBlacklistImpl(ds), ds);
+        var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(ds), ds);
 
         if (targetDomains.length > 0) {
             return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractDomain);
@@ -0,0 +1,5 @@
+package nu.marginalia.crawling.model;
+
+
+public record ContentType(String contentType, String charset) {
+}
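The new ContentType is a Java record rather than a class with public fields, which is why call sites elsewhere in this commit change from contentType.charset to contentType.charset(): records expose their components through generated accessor methods. A small sketch of the behavior:

```java
class ContentTypeDemo {
    // Mirrors the new record; components become final fields with accessors
    record ContentType(String contentType, String charset) {}

    public static void main(String[] args) {
        var ct = new ContentType("text/html", "UTF-8");
        // Accessor methods, not field reads -- hence the added () at call sites
        System.out.println(ct.contentType()); // prints: text/html
        System.out.println(ct.charset());     // prints: UTF-8
    }
}
```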
@@ -9,7 +9,7 @@ import lombok.SneakyThrows;
 import lombok.ToString;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;
-import nu.marginalia.model.crawl.EdgeContentType;
+import nu.marginalia.crawling.model.ContentType;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.bigstring.BigString;
@@ -257,11 +257,11 @@ public class HttpFetcher {
         byte[] data = byteStream.readNBytes(maxFetchSize);
 
         var contentType = ContentTypeParser.parse(contentTypeHeader, data);
-        if (!contentTypeLogic.isAllowableContentType(contentType.contentType)) {
+        if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) {
             return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
         }
 
-        if ("Shift_JIS".equalsIgnoreCase(contentType.charset)) {
+        if ("Shift_JIS".equalsIgnoreCase(contentType.charset())) {
             return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, "");
         }
 
@@ -280,10 +280,10 @@ public class HttpFetcher {
                 .build();
     }
 
-    private String getStringData(byte[] data, EdgeContentType contentType) {
+    private String getStringData(byte[] data, ContentType contentType) {
         Charset charset;
         try {
-            charset = Charset.forName(contentType.charset);
+            charset = Charset.forName(contentType.charset());
         }
         catch (IllegalCharsetNameException ex) {
             charset = StandardCharsets.UTF_8;
@@ -1,7 +1,7 @@
 package nu.marginalia.crawl.retreival.logic;
 
 import crawlercommons.mimetypes.MimeTypeDetector;
-import nu.marginalia.model.crawl.EdgeContentType;
+import nu.marginalia.crawling.model.ContentType;
 import org.jsoup.Jsoup;
 
 import java.util.Arrays;
@@ -11,25 +11,25 @@ public class ContentTypeParser {
 
     static final MimeTypeDetector mimeTypeDetector = new MimeTypeDetector();
 
-    public static EdgeContentType parse(String contentType, byte[] data) {
+    public static ContentType parse(String contentType, byte[] data) {
         return getContentTypeFromContentTypeString(contentType)
                 .or(() -> getContentTypeStringFromTag(data))
                 .orElseGet(() -> {
                     Optional<String> charset = getCharsetFromTag(data);
-                    return new EdgeContentType(
+                    return new ContentType(
                             Optional.ofNullable(contentType)
                                     .or(() -> Optional.ofNullable(mimeTypeDetector.detect(data)))
                                     .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(data)), charset.orElse("ISO_8859_1"));
                 });
     }
 
-    private static Optional<EdgeContentType> getContentTypeFromContentTypeString(String contentType) {
+    private static Optional<ContentType> getContentTypeFromContentTypeString(String contentType) {
         if (contentType != null && contentType.contains(";")) {
             var parts = contentType.split(";");
             var content = parts[0].trim();
             var extra = parts[1].trim();
             if (extra.startsWith("charset=")) {
-                return Optional.of(new EdgeContentType(content, extra.substring("charset=".length())));
+                return Optional.of(new ContentType(content, extra.substring("charset=".length())));
             }
         }
         return Optional.empty();
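The parse chain above tries the Content-Type header first, then a meta tag in the first kilobyte of the body, and finally falls back to MIME detection with ISO_8859_1 as the default charset. A hedged usage sketch, assuming the crawler module is on the classpath (the expected outputs are inferences from the code above, not verified behavior):

```java
import nu.marginalia.crawl.retreival.logic.ContentTypeParser;

import java.nio.charset.StandardCharsets;

class ContentTypeParseDemo {
    public static void main(String[] args) {
        // The header carries no ";charset=..." part, so parse() falls through
        // to the document's meta tag, then to MIME sniffing / ISO_8859_1
        byte[] body = "<html><head><meta charset=\"utf-8\"></head><body></body></html>"
                .getBytes(StandardCharsets.US_ASCII);

        var ct = ContentTypeParser.parse("text/html", body);
        System.out.println(ct.contentType()); // expected: text/html
        System.out.println(ct.charset());     // expected: utf-8, from the meta tag
    }
}
```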
@@ -53,7 +53,7 @@ public class ContentTypeParser {
 
     }
 
-    private static Optional<EdgeContentType> getContentTypeStringFromTag(byte[] data) {
+    private static Optional<ContentType> getContentTypeStringFromTag(byte[] data) {
         String header = new String(Arrays.copyOf(data, Math.min(1024, data.length)));
         var doc = Jsoup.parse(header);
         for (var metaTag : doc.getElementsByTag("meta")) {
@@ -24,6 +24,8 @@ dependencies {
     implementation project(':code:crawl:common')
     implementation project(':code:crawl:crawling-model')
     implementation project(':code:crawl:converting-process')
+    implementation project(':code:features:adblock')
+    implementation project(':code:features:topic-detection')
 
     implementation libs.lombok
     annotationProcessor libs.lombok
@@ -1,7 +1,7 @@
 package nu.marginalia.experimental;
 
+import nu.marginalia.adblock.AdblockSimulator;
 import nu.marginalia.converting.processor.DocumentProcessor;
-import nu.marginalia.converting.processor.logic.topic.AdblockSimulator;
 import nu.marginalia.crawling.common.plan.CrawlPlanLoader;
 import nu.marginalia.crawling.common.plan.EdgeCrawlPlan;
 import nu.marginalia.crawling.model.CrawledDocument;
@@ -10,10 +10,10 @@ import nu.marginalia.converting.processor.logic.DomPruningFilter;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.processor.DomainProcessor;
-import nu.marginalia.converting.processor.logic.topic.GoogleAnwersSpamDetector;
+import nu.marginalia.adblock.GoogleAnwersSpamDetector;
-import nu.marginalia.converting.processor.logic.topic.RecipeDetector;
+import nu.marginalia.topic.RecipeDetector;
-import nu.marginalia.converting.processor.logic.topic.TextileCraftDetector;
+import nu.marginalia.topic.TextileCraftDetector;
-import nu.marginalia.converting.processor.logic.topic.WoodworkingDetector;
+import nu.marginalia.topic.WoodworkingDetector;
 import org.jsoup.Jsoup;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -1,8 +1,8 @@
 package nu.marginalia.experimental;
 
 import lombok.SneakyThrows;
+import nu.marginalia.adblock.AdblockSimulator;
 import nu.marginalia.converting.processor.DocumentProcessor;
-import nu.marginalia.converting.processor.logic.topic.AdblockSimulator;
 import nu.marginalia.crawling.common.plan.CrawlPlanLoader;
 import nu.marginalia.crawling.common.plan.EdgeCrawlPlan;
 import nu.marginalia.crawling.model.CrawledDocument;
@@ -28,7 +28,6 @@ dependencies {
     implementation project(':code:index:lexicon')
     implementation project(':code:index:index-journal')
     implementation project(':code:libraries:language-processing')
-    implementation project(':code:libraries:misc')
 
     testImplementation project(':code:services-core:search-service')
 
@@ -2,7 +2,7 @@ package nu.marginalia.loading.loader;
 
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.converting.instruction.Interpreter;
 import nu.marginalia.converting.model.DocumentKeywords;
@@ -76,7 +76,7 @@ public class Loader implements Interpreter {
     }
 
     @Override
-    public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
+    public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) {
         sqlLoadProcessedDomain.load(data, domain, state, ip);
     }
 
@@ -2,7 +2,7 @@ package nu.marginalia.loading.loader;
 
 import com.google.inject.Inject;
 import com.zaxxer.hikari.HikariDataSource;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.converting.instruction.instructions.DomainLink;
 import nu.marginalia.model.EdgeDomain;
 import org.slf4j.Logger;
@@ -42,7 +42,7 @@ public class SqlLoadProcessedDomain {
         }
     }
 
-    public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
+    public void load(LoaderData data, EdgeDomain domain, DomainIndexingState state, String ip) {
         data.setTargetDomain(domain);
 
         loadDomains.load(data, domain);
@@ -9,8 +9,8 @@ import nu.marginalia.loading.loader.SqlLoadProcessedDocument;
 import nu.marginalia.loading.loader.SqlLoadUrls;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
-import nu.marginalia.model.crawl.EdgeUrlState;
+import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.id.EdgeIdArray;
 import org.junit.jupiter.api.*;
@@ -69,11 +69,11 @@ class SqlLoadProcessedDocumentTest {
 
         loader.load(loaderData, List.of(new LoadProcessedDocument(
                 url,
-                EdgeUrlState.OK,
+                UrlIndexingState.OK,
                 "TITLE",
                 "DESCR",
                 HtmlFeature.encode(Set.of(HtmlFeature.AFFILIATE_LINK)),
-                EdgeHtmlStandard.HTML5,
+                HtmlStandard.HTML5,
                 100,
                 12345,
                 -3.14,
@@ -6,7 +6,7 @@ import nu.marginalia.loading.loader.SqlLoadDomains;
 import nu.marginalia.loading.loader.SqlLoadProcessedDomain;
 import nu.marginalia.converting.instruction.instructions.DomainLink;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Tag;
@@ -48,7 +48,7 @@ class SqlLoadProcessedDomainTest {
     @Test
     public void loadProcessedDomain() {
         var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource));
-        loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), EdgeDomainIndexingState.BLOCKED, "127.0.0.1");
+        loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1");
     }
     @Test
     public void loadDomainAlias() {
code/features/adblock/build.gradle (new file)
@@ -0,0 +1,41 @@
+plugins {
+    id 'java'
+    id "io.freefair.lombok" version "5.3.3.3"
+
+    id "de.undercouch.download" version "5.1.0"
+
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(17))
+    }
+}
+
+dependencies {
+    implementation project(':code:common:config')
+
+    implementation libs.lombok
+    annotationProcessor libs.lombok
+
+    implementation libs.bundles.slf4j
+    implementation libs.guice
+    implementation libs.notnull
+    implementation libs.jsoup
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
+
+
+test {
+    useJUnitPlatform()
+}
+
+task fastTests(type: Test) {
+    useJUnitPlatform {
+        excludeTags "slow"
+    }
+}
code/features/adblock/readme.md (new file)
@@ -0,0 +1,8 @@
+# Adblock
+
+Contains an adblock simulator that reads an adblock specifications file and
+uses it to identify if a document has ads.
+
+## Central Classes
+
+* [AdblockSimulator](src/main/java/nu/marginalia/adblock/AdblockSimulator.java)
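A sketch of how the extracted module might be exercised; AdblockSimulator's constructor and method names are not shown in this diff, so hasAds() and the no-argument construction below are assumptions for illustration only:

```java
import nu.marginalia.adblock.AdblockSimulator;
import org.jsoup.Jsoup;

class AdblockDemo {
    public static void main(String[] args) {
        // Hypothetical API -- in the real code the simulator is injectable
        // via Guice and reads an adblock specifications file
        AdblockSimulator simulator = new AdblockSimulator();

        var doc = Jsoup.parse("<html><body><div class=\"ad-banner\">buy now</div></body></html>");
        System.out.println("has ads: " + simulator.hasAds(doc)); // assumed method name
    }
}
```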
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.processor.logic.topic;
+package nu.marginalia.adblock;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.processor.logic.topic;
+package nu.marginalia.adblock;
 
 import org.jsoup.nodes.Document;
 
@@ -2,7 +2,7 @@ package nu.marginalia.ranking.data;
 
 import lombok.AllArgsConstructor;
 import lombok.Data;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 
 @Data
 @AllArgsConstructor
@@ -10,7 +10,7 @@ public class RankingDomainData {
     public final int id;
     public final String name;
     private int alias;
-    public EdgeDomainIndexingState state;
+    public DomainIndexingState state;
     public final int knownUrls;
 
     public int resolveAlias() {
@@ -23,10 +23,10 @@ public class RankingDomainData {
     }
 
     public boolean isSpecial() {
-        return EdgeDomainIndexingState.SPECIAL == state;
+        return DomainIndexingState.SPECIAL == state;
     }
 
     public boolean isSocialMedia() {
-        return EdgeDomainIndexingState.SOCIAL_MEDIA == state;
+        return DomainIndexingState.SOCIAL_MEDIA == state;
     }
 }
@@ -3,8 +3,8 @@ package nu.marginalia.ranking.data;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import com.zaxxer.hikari.HikariDataSource;
-import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl;
+import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
-import nu.marginalia.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -15,13 +15,13 @@ import java.util.function.IntConsumer;
 @Singleton
 public class RankingDomainFetcher {
     protected final HikariDataSource dataSource;
-    protected final EdgeDomainBlacklistImpl blacklist;
+    protected final DomainBlacklistImpl blacklist;
     protected final Logger logger = LoggerFactory.getLogger(getClass());
 
     protected boolean getNames = false;
 
     @Inject
-    public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
+    public RankingDomainFetcher(HikariDataSource dataSource, DomainBlacklistImpl blacklist) {
         this.dataSource = dataSource;
         this.blacklist = blacklist;
     }
@@ -66,7 +66,7 @@ public class RankingDomainFetcher {
                 new RankingDomainData(id,
                         rsp.getString(2),
                         rsp.getInt(3),
-                        EdgeDomainIndexingState.valueOf(rsp.getString(4)),
+                        DomainIndexingState.valueOf(rsp.getString(4)),
                         rsp.getInt(5)));
             }
         }
@@ -3,7 +3,7 @@ package nu.marginalia.ranking.data;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import com.zaxxer.hikari.HikariDataSource;
-import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl;
+import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
 import org.slf4j.LoggerFactory;
 
 import java.sql.SQLException;
@@ -14,7 +14,7 @@ public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher
     final boolean hasData;
 
     @Inject
-    public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
+    public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, DomainBlacklistImpl blacklist) {
         super(dataSource, blacklist);
 
         hasData = isDomainNeighborTablePopulated(dataSource);
@@ -2,7 +2,7 @@ package nu.marginalia.ranking.tool;
 
 import com.zaxxer.hikari.HikariDataSource;
 import lombok.SneakyThrows;
-import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl;
+import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
 import nu.marginalia.ranking.StandardPageRank;
 import nu.marginalia.ranking.accumulator.RankingResultListAccumulator;
 import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
@@ -32,7 +32,7 @@ public class CreateBrowseDomainRanksTool {
 
         logger.info("Ranking");
         var ds = new DatabaseModule().provideConnection();
-        var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
+        var domains = new RankingDomainFetcherForSimilarityData(ds, new DomainBlacklistImpl(ds));
         var rpr = new StandardPageRank(domains, args);
 
         uploader.start();
@@ -13,7 +13,7 @@ import lombok.SneakyThrows;
 import nu.marginalia.ranking.RankingAlgorithm;
 import nu.marginalia.ranking.data.RankingDomainData;
 import nu.marginalia.ranking.data.RankingDomainFetcher;
-import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl;
+import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
 import nu.marginalia.service.module.DatabaseModule;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
@@ -49,7 +49,7 @@ public class PerusePageRankV2 {
     @SneakyThrows
     public static void main(String... args) {
         var ds = new DatabaseModule().provideConnection();
-        var blacklist = new EdgeDomainBlacklistImpl(ds);
+        var blacklist = new DomainBlacklistImpl(ds);
         var rank = new PerusePageRankV2(new RankingDomainFetcher(ds, blacklist));
 
         long start = System.currentTimeMillis();
@@ -3,7 +3,7 @@ package nu.marginalia.ranking.tool;
 import lombok.SneakyThrows;
 import nu.marginalia.ranking.accumulator.RankingResultListAccumulator;
 import nu.marginalia.ranking.data.RankingDomainFetcher;
-import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl;
+import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
 import nu.marginalia.ranking.StandardPageRank;
 import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
 import nu.marginalia.service.module.DatabaseModule;
@@ -35,11 +35,11 @@ public class PrintDomainRanksTool {
 
         RankingDomainFetcher domains;
         if (Boolean.getBoolean("use-link-data")) {
-            domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
+            domains = new RankingDomainFetcher(ds, new DomainBlacklistImpl(ds));
             domains.retainNames();
         }
         else {
-            domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
+            domains = new RankingDomainFetcherForSimilarityData(ds, new DomainBlacklistImpl(ds));
             domains.retainNames();
         }
 
@@ -6,7 +6,7 @@ import nu.marginalia.ranking.StandardPageRank;
 import nu.marginalia.ranking.accumulator.RankingResultListAccumulator;
 import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
 
-import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl;
+import nu.marginalia.model.dbcommon.DomainBlacklistImpl;
 import nu.marginalia.service.module.DatabaseModule;
 import org.mariadb.jdbc.Driver;
 import org.slf4j.Logger;
@@ -33,7 +33,7 @@ public class UpdateDomainRanksTool {
         var uploader = new Thread(() -> uploadThread(conn), "Uploader");
 
         logger.info("Ranking");
-        var domains = new RankingDomainFetcherForSimilarityData(conn, new EdgeDomainBlacklistImpl(conn));
+        var domains = new RankingDomainFetcherForSimilarityData(conn, new DomainBlacklistImpl(conn));
         var rpr = new StandardPageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com");
 
         rankMax = rpr.size();
code/features/pubdate/build.gradle (new file)
@@ -0,0 +1,44 @@
+plugins {
+    id 'java'
+    id "io.freefair.lombok" version "5.3.3.3"
+
+    id "de.undercouch.download" version "5.1.0"
+
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(17))
+    }
+}
+
+dependencies {
+    implementation project(':code:common:model')
+    implementation project(':code:crawl:common')
+
+    implementation libs.lombok
+    annotationProcessor libs.lombok
+
+    implementation libs.bundles.slf4j
+    implementation libs.guice
+    implementation libs.notnull
+    implementation libs.gson
+    implementation libs.jsoup
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+    testImplementation project(':code:common:config')
+}
+
+
+test {
+    useJUnitPlatform()
+}
+
+task fastTests(type: Test) {
+    useJUnitPlatform {
+        excludeTags "slow"
+    }
+}
code/features/pubdate/readme.md (new file)
@@ -0,0 +1,7 @@
+# Pubdate
+
+Contains advanced haruspicy for figuring out when a document was published.
+
+## Central Classes
+
+* [PubDateSniffer](src/main/java/nu/marginalia/pubdate/PubDateSniffer.java)
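The entry point is PubDateSniffer.getPubDate, whose signature appears further down in this diff. A usage sketch assuming a no-argument constructor and an EdgeUrl(String) constructor (both are assumptions; the URL and document are illustrative):

```java
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.PubDateSniffer;
import org.jsoup.Jsoup;

class PubDateDemo {
    public static void main(String[] args) throws Exception {
        var sniffer = new PubDateSniffer();
        var doc = Jsoup.parse("<html><body><time datetime=\"2006-08-24\">Aug 24</time></body></html>");

        // runExpensive=false keeps the effort level at LOW, skipping the
        // DOM-walking passes that only run at PubDateEffortLevel.HIGH
        var pubDate = sniffer.getPubDate("", new EdgeUrl("https://www.example.com/"),
                doc, HtmlStandard.HTML5, false);

        System.out.println(pubDate);
    }
}
```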
@@ -0,0 +1,6 @@
+package nu.marginalia.pubdate;
+
+public enum PubDateEffortLevel {
+    LOW,
+    HIGH
+}
@@ -1,7 +1,7 @@
-package nu.marginalia.converting.processor.logic.pubdate;
+package nu.marginalia.pubdate;
 
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
 import org.jsoup.nodes.Document;
 
@@ -9,5 +9,5 @@ import java.util.Optional;
 
 public interface PubDateHeuristic {
 
-    Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard);
+    Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard);
 }
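Each heuristic is a single-method implementation of this interface. A minimal hypothetical example against the new signature; the meta property targeted here is an illustrative choice, and the assumption that PubDateParser.attemptParseDate returns Optional&lt;PubDate&gt; is inferred from its use in PubDateHeuristicHtml5AnyTimeTag later in this diff:

```java
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import org.jsoup.nodes.Document;

import java.util.Optional;

// Illustrative only: reads the Open Graph article:published_time meta tag
public class PubDateHeuristicOpenGraphExample implements PubDateHeuristic {
    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers,
                                   EdgeUrl url, Document document, HtmlStandard htmlStandard) {
        var tag = document.selectFirst("meta[property=article:published_time]");
        if (tag == null)
            return Optional.empty();

        return PubDateParser.attemptParseDate(tag.attr("content"));
    }
}
```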
@@ -1,6 +1,6 @@
-package nu.marginalia.converting.processor.logic.pubdate;
+package nu.marginalia.pubdate;
 
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
 
 import java.time.DateTimeException;
@@ -122,7 +122,7 @@ public class PubDateParser {
         return (max + min) / 2;
     }
 
-    public static int guessYear(EdgeHtmlStandard standard) {
+    public static int guessYear(HtmlStandard standard) {
         // Create some jitter to avoid having documents piling up in the same four years
         // as this would make searching in those years disproportionately useless
 
@@ -1,9 +1,9 @@
-package nu.marginalia.converting.processor.logic.pubdate;
+package nu.marginalia.pubdate;
 
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.heuristic.*;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.pubdate.heuristic.*;
 import org.jsoup.nodes.Document;
 
 import java.util.ArrayList;
@@ -36,7 +36,7 @@ public class PubDateSniffer {
         heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
     }
 
-    public PubDate getPubDate(String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard, boolean runExpensive) {
+    public PubDate getPubDate(String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) {
         final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW;
 
         for (var heuristic : heuristics) {
@@ -1,11 +1,11 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
+package nu.marginalia.pubdate.heuristic;
 
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
+import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
+import nu.marginalia.pubdate.PubDateEffortLevel;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -18,7 +18,7 @@ import java.util.Optional;
 public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         if (effortLevel == PubDateEffortLevel.LOW)
             return Optional.empty();
 
@@ -32,9 +32,9 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
 
     private static class DateExtractingNodeVisitorPass implements NodeFilter {
         public PubDate pubDate;
-        private final EdgeHtmlStandard htmlStandard;
+        private final HtmlStandard htmlStandard;
 
-        private DateExtractingNodeVisitorPass(EdgeHtmlStandard htmlStandard) {
+        private DateExtractingNodeVisitorPass(HtmlStandard htmlStandard) {
             this.htmlStandard = htmlStandard;
         }
 
@@ -130,7 +130,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
         }
 
         private void parse(String text) {
-            if (htmlStandard == EdgeHtmlStandard.UNKNOWN) {
+            if (htmlStandard == HtmlStandard.UNKNOWN) {
                 PubDateParser
                         .dateFromHighestYearLookingSubstring(text)
                         .ifPresent(this::setPubDate);
@@ -1,10 +1,10 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
+package nu.marginalia.pubdate.heuristic;
 
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
+import nu.marginalia.pubdate.PubDateEffortLevel;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
+import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
@@ -17,7 +17,7 @@ import java.util.Optional;
 public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         if (effortLevel == PubDateEffortLevel.LOW)
             return Optional.empty();
 
@@ -31,9 +31,9 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
 
     private static class DateExtractingNodeVisitor implements NodeFilter {
        public PubDate pubDate;
-        private final EdgeHtmlStandard htmlStandard;
+        private final HtmlStandard htmlStandard;
 
-        private DateExtractingNodeVisitor(EdgeHtmlStandard htmlStandard) {
+        private DateExtractingNodeVisitor(HtmlStandard htmlStandard) {
             this.htmlStandard = htmlStandard;
         }
 
@ -71,7 +71,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void parse(String text) {
|
private void parse(String text) {
|
||||||
if (htmlStandard == EdgeHtmlStandard.UNKNOWN) {
|
if (htmlStandard == HtmlStandard.UNKNOWN) {
|
||||||
PubDateParser
|
PubDateParser
|
||||||
.dateFromHighestYearLookingSubstring(text)
|
.dateFromHighestYearLookingSubstring(text)
|
||||||
.ifPresent(this::setPubDate);
|
.ifPresent(this::setPubDate);
|
@@ -0,0 +1,23 @@
+package nu.marginalia.pubdate.heuristic;
+
+import nu.marginalia.crawling.common.model.HtmlStandard;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.pubdate.PubDateEffortLevel;
+import nu.marginalia.pubdate.PubDateHeuristic;
+import nu.marginalia.pubdate.PubDateParser;
+import nu.marginalia.model.EdgeUrl;
+import org.jsoup.nodes.Document;
+
+import java.util.Optional;
+
+public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {
+
+    @Override
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+        if (htmlStandard == HtmlStandard.UNKNOWN)
+            return Optional.empty();
+
+        return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard)));
+    }
+
+}
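The hunk above is the one wholly new file in this part of the diff: a fallback heuristic that, when the document's markup standard is known, guesses a typical publication year for that standard. A hypothetical call site, for orientation only (the types and the apply signature come from the diff; the invocation itself does not):

// Sketch, not code from this commit. EdgeUrl's constructor may throw
// URISyntaxException, as the test file further down declares.
PubDateHeuristic heuristic = new PubDateHeuristicGuessFromHtmlStandard();
Optional<PubDate> guess = heuristic.apply(
        PubDateEffortLevel.LOW,                  // effort level is not consulted by this heuristic
        "",                                      // raw response headers, unused here
        new EdgeUrl("https://www.example.com/"),
        Jsoup.parse("<html></html>"),
        HtmlStandard.HTML5);                     // anything but UNKNOWN yields a guess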
@@ -1,11 +1,11 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
+package nu.marginalia.pubdate.heuristic;
 
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.pubdate.PubDateHeuristic;
+import nu.marginalia.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
+import nu.marginalia.pubdate.PubDateEffortLevel;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -13,7 +13,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         // HTML5, alternative approach
         for (var tag : document.select("time")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
@@ -1,11 +1,11 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
+package nu.marginalia.pubdate.heuristic;
 
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.pubdate.PubDateHeuristic;
+import nu.marginalia.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
+import nu.marginalia.pubdate.PubDateEffortLevel;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -13,7 +13,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         // HTML5
         for (var tag : document.select("time[pubdate=\"pubdate\"]")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
@@ -1,11 +1,11 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
+package nu.marginalia.pubdate.heuristic;
 
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.pubdate.PubDateHeuristic;
+import nu.marginalia.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
+import nu.marginalia.pubdate.PubDateEffortLevel;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -13,7 +13,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         for (var tag : document.select("time[itemprop=\"datePublished\"]")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
             if (maybeDate.isPresent()) {
@@ -1,14 +1,14 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
+package nu.marginalia.pubdate.heuristic;
 
 import com.google.gson.Gson;
 import com.google.gson.GsonBuilder;
 import com.google.gson.JsonSyntaxException;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.pubdate.PubDateHeuristic;
+import nu.marginalia.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
+import nu.marginalia.pubdate.PubDateEffortLevel;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -16,7 +16,7 @@ import java.util.Optional;
 public class PubDateHeuristicJSONLD implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         for (var tag : document.select("script[type=\"application/ld+json\"]")) {
             var maybeDate = parseLdJson(tag.data())
                     .flatMap(PubDateParser::attemptParseDate);
@@ -1,11 +1,11 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
+package nu.marginalia.pubdate.heuristic;
 
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.pubdate.PubDateHeuristic;
+import nu.marginalia.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
+import nu.marginalia.pubdate.PubDateEffortLevel;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -13,7 +13,7 @@ import java.util.Optional;
 public class PubDateHeuristicLastModified implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         String lmString = "last-modified: ";
         int offset = headers.toLowerCase().indexOf(lmString);
 
@@ -1,11 +1,11 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
+package nu.marginalia.pubdate.heuristic;
 
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.pubdate.PubDateHeuristic;
+import nu.marginalia.pubdate.PubDateParser;
+import nu.marginalia.pubdate.PubDateEffortLevel;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -13,7 +13,7 @@ import java.util.Optional;
 public class PubDateHeuristicMicrodata implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
 
         for (var tag : document.select("meta[itemprop=\"datePublished\"]")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
@@ -1,10 +1,10 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
+package nu.marginalia.pubdate.heuristic;
 
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.pubdate.PubDateEffortLevel;
+import nu.marginalia.pubdate.PubDateHeuristic;
+import nu.marginalia.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
 import org.jsoup.nodes.Document;
 
@@ -13,7 +13,7 @@ import java.util.Optional;
 public class PubDateHeuristicOpenGraph implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         // OG
         for (var tag : document.select("meta[property=\"article:published_time\"]")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
@@ -1,10 +1,10 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
+package nu.marginalia.pubdate.heuristic;
 
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.pubdate.PubDateEffortLevel;
+import nu.marginalia.pubdate.PubDateHeuristic;
+import nu.marginalia.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
 import org.jsoup.nodes.Document;
 
@@ -13,7 +13,7 @@ import java.util.Optional;
 public class PubDateHeuristicRDFaTag implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         for (var tag : document.select("meta[property=\"datePublished\"]")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
             if (maybeDate.isPresent()) {
@@ -1,11 +1,11 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
+package nu.marginalia.pubdate.heuristic;
 
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.pubdate.PubDateHeuristic;
+import nu.marginalia.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
+import nu.marginalia.pubdate.PubDateEffortLevel;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -20,7 +20,7 @@ public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {
     private static final int MIN_URL_PATTERN_YEAR = 2000;
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         final String urlString = url.path;
 
         var matcher = yearUrlPattern.matcher(urlString);
@@ -1,11 +1,11 @@
-package nu.marginalia.converting.processor.logic.pubdate.heuristic;
+package nu.marginalia.pubdate.heuristic;
 
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.pubdate.PubDateHeuristic;
+import nu.marginalia.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel;
+import nu.marginalia.pubdate.PubDateEffortLevel;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -17,7 +17,7 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {
     private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         final String urlString = url.path;
 
         var matcher = yearUrlPattern.matcher(urlString);
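Both URL-pattern passes key on a /YYYY/ segment in the path; the Pass2 hunk shows the pattern itself, /\d{4}/. A small self-contained illustration of that matcher step (the validation around it, such as Pass1's MIN_URL_PATTERN_YEAR = 2000, is only hinted at by the diff):

import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

class YearFromPathSketch {
    private static final Pattern YEAR_SEGMENT = Pattern.compile("/(\\d{4})/");

    // Extract the first /YYYY/ path segment, as the heuristics' matcher step does.
    static Optional<Integer> yearFromPath(String path) {
        Matcher m = YEAR_SEGMENT.matcher(path);
        return m.find() ? Optional.of(Integer.parseInt(m.group(1)))
                        : Optional.empty();
    }

    public static void main(String[] args) {
        System.out.println(yearFromPath("/blog/2010/03/keyboards/")); // Optional[2010]
    }
}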
@@ -1,11 +1,9 @@
-package nu.marginalia.converting.logic;
+package nu.marginalia.pubdate;
 
 import nu.marginalia.WmsaHome;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateParser;
-import nu.marginalia.converting.processor.logic.pubdate.PubDateSniffer;
-import nu.marginalia.converting.processor.logic.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.crawling.common.model.HtmlStandard;
+import nu.marginalia.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Test;
 
@@ -75,7 +73,7 @@ class PubDateSnifferTest {
                 <time pubdate="pubdate" datetime="2022-08-24">time</time>
                 Wow, sure lor 'em boss
                 </article>
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals("2022-08-24", ret.dateIso8601());
@@ -91,7 +89,7 @@ class PubDateSnifferTest {
                 <time>2022-08-24</time>
                 Wow, sure lor 'em boss
                 </article>
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals("2022-08-24", ret.dateIso8601());
@@ -107,7 +105,7 @@ class PubDateSnifferTest {
                 <time class="published" datetime="July 13, 2006">July 13, 2006</time>
                 Wow, sure lor 'em boss
                 </article>
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals(2006, ret.year());
@@ -117,14 +115,14 @@ class PubDateSnifferTest {
     public void testProblemCases() throws IOException, URISyntaxException {
         var ret = dateSniffer.getPubDate("",
                 new EdgeUrl("https://www.example.com/"),
-                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), EdgeHtmlStandard.HTML5, true);
+                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), HtmlStandard.HTML5, true);
 
         assertFalse(ret.isEmpty());
         assertEquals(2006, ret.year());
 
         ret = dateSniffer.getPubDate("",
                 new EdgeUrl("https://www.example.com/"),
-                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), EdgeHtmlStandard.XHTML, true);
+                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), HtmlStandard.XHTML, true);
 
         assertFalse(ret.isEmpty());
         assertEquals(2010, ret.year());
@@ -147,7 +145,7 @@ class PubDateSnifferTest {
                 <!doctype html>
                 <html>
                 <meta itemprop="datePublished" content="2022-08-24" />
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals("2022-08-24", ret.dateIso8601());
@@ -161,7 +159,7 @@ class PubDateSnifferTest {
                 <!doctype html>
                 <html>
                 <meta property="datePublished" content="2022-08-24" />
-                """),EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals("2022-08-24", ret.dateIso8601());
@@ -175,7 +173,7 @@ class PubDateSnifferTest {
                 <!doctype html>
                 <html>
                 <script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script><script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script>
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals("2004-08-24", ret.dateIso8601());
@@ -189,7 +187,7 @@ class PubDateSnifferTest {
                 <!doctype html>
                 <html>
                 <title>No date in the HTML</title>
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertNull(ret.dateIso8601());
@@ -204,7 +202,7 @@ class PubDateSnifferTest {
                 <!doctype html>
                 <html>
                 <title>No date in the HTML</title>
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals("2022-02-03", ret.dateIso8601());
@@ -219,7 +217,7 @@ class PubDateSnifferTest {
                 <!doctype html>
                 <html>
                 <p>Published 2003, updated 2022</p>
-                """), EdgeHtmlStandard.HTML5, true);
+                """), HtmlStandard.HTML5, true);
 
         assertFalse(ret.isEmpty());
         assertNull(ret.dateIso8601());
@@ -245,7 +243,7 @@ class PubDateSnifferTest {
                 <!doctype html>
                 <html>
                 <div style="float: left;"> <b>Post subject:</b> Keyboards.</div><div style="float: right;"><span class="postdetails"><b><img src="./styles/subsilver2/imageset/icon_post_target.gif" width="12" height="9" alt="Post" title="Post" /> <a href="./viewtopic.php?p=34580&sid=cf0c13dedebb4fea1f03fa73e510cd9f#p34580">#1</a></b></span> <b>Posted:</b> Sun Oct 03, 2010 5:37 pm </div>
-                """), EdgeHtmlStandard.UNKNOWN, true);
+                """), HtmlStandard.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertNull(ret.dateIso8601());
@@ -13,7 +13,6 @@ java {
 }
 
 dependencies {
     implementation project(':code:libraries:language-processing')
-    implementation project(':code:libraries:misc')
     implementation project(':code:common:config')
     implementation project(':code:common:model')
 
@@ -3,7 +3,7 @@ package nu.marginalia.query_parser;
 import nu.marginalia.language.WordPatterns;
 import nu.marginalia.query_parser.token.Token;
 import nu.marginalia.query_parser.token.TokenType;
-import nu.marginalia.util.TransformList;
+import nu.marginalia.transform_list.TransformList;
 
 import java.util.List;
 
@@ -1,4 +1,4 @@
-package nu.marginalia.util;
+package nu.marginalia.transform_list;
 
 import java.util.List;
 import java.util.function.BiConsumer;
@@ -1,4 +1,4 @@
-package nu.marginalia.util;
+package nu.marginalia.transform_list;
 
 import org.junit.jupiter.api.Test;
 
@@ -5,7 +5,7 @@ import com.google.inject.Singleton;
 import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.browse.model.BrowseResult;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.dbcommon.EdgeDomainBlacklist;
+import nu.marginalia.model.dbcommon.DomainBlacklist;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -23,7 +23,7 @@ public class DbBrowseDomainsRandom {
         this.dataSource = dataSource;
     }
 
-    public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist, int set) {
+    public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
 
         final String q = """
                 SELECT DOMAIN_ID, DOMAIN_NAME
@@ -5,7 +5,7 @@ import com.google.inject.Singleton;
 import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.browse.model.BrowseResult;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.dbcommon.EdgeDomainBlacklist;
+import nu.marginalia.model.dbcommon.DomainBlacklist;
 import nu.marginalia.model.id.EdgeId;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -24,7 +24,7 @@ public class DbBrowseDomainsSimilarCosine {
         this.dataSource = dataSource;
     }
 
-    public List<BrowseResult> getDomainNeighborsAdjacentCosine(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
+    public List<BrowseResult> getDomainNeighborsAdjacentCosine(EdgeId<EdgeDomain> domainId, DomainBlacklist blacklist, int count) {
         List<BrowseResult> domains = new ArrayList<>(count);
 
         String q = """
@@ -6,7 +6,7 @@ import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.browse.model.BrowseResult;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.dbcommon.EdgeDomainBlacklist;
+import nu.marginalia.model.dbcommon.DomainBlacklist;
 import nu.marginalia.model.id.EdgeId;
 import nu.marginalia.model.id.EdgeIdCollection;
 import org.slf4j.Logger;
@@ -26,7 +26,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
         this.dataSource = dataSource;
     }
 
-    public List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
+    public List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, DomainBlacklist blacklist, int count) {
         final Set<BrowseResult> domains = new HashSet<>(count*3);
 
         final String q = """
@@ -131,7 +131,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
         return new ArrayList<>(domains);
     }
 
-    public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist, int set) {
+    public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
 
         final String q = """
                 SELECT DOMAIN_ID, DOMAIN_NAME
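In the three DbBrowseDomains* classes the change is purely mechanical: the blacklist parameter type is renamed from EdgeDomainBlacklist to DomainBlacklist, so call sites track the rename one-for-one. A hypothetical call site (only the getRandomDomains signature and the dataSource assignment come from the diff; the objects here are assumed to be supplied by the surrounding application):

// Sketch, not code from this commit: assumes a configured HikariDataSource and
// a DomainBlacklist instance are available, e.g. via dependency injection.
DbBrowseDomainsRandom random = new DbBrowseDomainsRandom(dataSource);
List<BrowseResult> picks = random.getRandomDomains(25, blacklist, 0);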
Some files were not shown because too many files have changed in this diff.