(restructure) Clean up repo by moving stray features into converter-process and crawler-process

This commit is contained in:
Viktor Lofgren 2024-07-30 10:04:59 +02:00
parent 7e4efa45b8
commit 80900107f7
334 changed files with 369 additions and 500 deletions

View File

@ -44,8 +44,8 @@ subprojects.forEach {it ->
}
ext {
jvmVersion=21
dockerImageBase='container-registry.oracle.com/graalvm/jdk:21@sha256:1fd33d4d4eba3a9e1a41a728e39ea217178d257694eea1214fec68d2ed4d3d9b'
jvmVersion=22
dockerImageBase='container-registry.oracle.com/graalvm/jdk:22'
dockerImageTag='latest'
dockerImageRegistry='marginalia'
}

View File

@ -40,10 +40,8 @@ dependencies {
implementation project(':code:processes:crawling-process:model')
implementation project(':code:processes:crawling-process:model')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-convert:data-extractors')
implementation project(':code:features-convert:stackexchange-xml')
implementation project(':code:features-convert:reddit-json')
implementation project(':code:processes:crawling-process:ft-link-parser')
implementation project(':code:execution:data-extractors')
implementation project(':code:index:index-journal')
implementation project(':code:index:api')
implementation project(':code:processes:process-mq-api')

View File

@ -22,8 +22,8 @@ dependencies {
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:blocking-thread-pool')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-convert:anchor-keywords')
implementation project(':code:processes:crawling-process:ft-link-parser')
implementation project(':code:processes:converting-process:ft-anchor-keywords')
implementation project(':code:processes:crawling-process:model')
implementation project(':code:processes:converting-process')
implementation project(':third-party:commons-codec')

View File

@ -1,33 +0,0 @@
plugins {
id 'java'
id "de.undercouch.download" version "5.1.0"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:common:config')
implementation libs.bundles.slf4j
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.notnull
implementation libs.jsoup
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}

View File

@ -1,8 +0,0 @@
# Adblock
Contains an adblock simulator that reads an adblock specification file and
uses it to identify whether a document has ads.
## Central Classes
* [AdblockSimulator](java/nu/marginalia/adblock/AdblockSimulator.java)
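As a rough illustration of the approach (the class and rule handling below are made up for the example, not the actual AdblockSimulator API), a simulator of this kind can check element URLs in a parsed document against a list of filter fragments:
```java
import org.jsoup.nodes.Document;

import java.util.List;

class SimpleAdblockCheck {
    private final List<String> blockedFragments;

    SimpleAdblockCheck(List<String> blockedFragments) {
        // e.g. URL fragments read from an EasyList-style specification file
        this.blockedFragments = blockedFragments;
    }

    boolean hasAds(Document doc) {
        // Flag the document if any script or image source matches a filter fragment
        return doc.select("script[src], img[src]").stream()
                .map(el -> el.attr("src"))
                .anyMatch(src -> blockedFragments.stream().anyMatch(src::contains));
    }
}
```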

View File

@ -1,34 +0,0 @@
plugins {
id 'java'
id "de.undercouch.download" version "5.1.0"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:common:model')
implementation libs.bundles.slf4j
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.notnull
implementation libs.bundles.gson
implementation libs.jsoup
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation project(':code:common:config')
}

View File

@ -1,7 +0,0 @@
# Pubdate
Contains advanced haruspicy for figuring out when a document was published.
## Central Classes
* [PubDateSniffer](java/nu/marginalia/pubdate/PubDateSniffer.java)
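A minimal sketch of the heuristic-chain idea behind the sniffer (interface and class names here are illustrative, not the actual API): try a list of heuristics in order, from most to least trustworthy, and keep the first date found.
```java
import org.jsoup.nodes.Document;

import java.time.LocalDate;
import java.util.List;
import java.util.Optional;

interface DateHeuristic {
    // Each heuristic inspects the parsed document and may or may not find a date
    Optional<LocalDate> sniff(Document doc);
}

class SimplePubDateSniffer {
    private final List<DateHeuristic> heuristics;

    SimplePubDateSniffer(List<DateHeuristic> heuristics) {
        // Ordered from most to least trustworthy
        this.heuristics = heuristics;
    }

    Optional<LocalDate> sniff(Document doc) {
        for (DateHeuristic heuristic : heuristics) {
            Optional<LocalDate> date = heuristic.sniff(doc);
            if (date.isPresent())
                return date; // first plausible hit wins
        }
        return Optional.empty();
    }
}
```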

View File

@ -1,13 +0,0 @@
# Converter Features
## Major features
* [keyword-extraction](keyword-extraction/) - Identifies keywords to index in a document
* [summary-extraction](summary-extraction/) - Generates an excerpt/quote from a website to display on the search results page.
## Smaller features:
* [adblock](adblock/) - Simulates Adblock
* [pubdate](pubdate/) - Determines when a document was published
* [topic-detection](topic-detection/) - Tries to identify the topic of a website

View File

@ -1,44 +0,0 @@
plugins {
id 'java'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation libs.bundles.slf4j
implementation project(':code:libraries:blocking-thread-pool')
implementation project(':code:common:model')
implementation libs.notnull
implementation libs.jsoup
implementation libs.sqlite
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.guava
implementation libs.gson
implementation libs.zstd
implementation libs.trove
implementation libs.commons.compress
implementation libs.xz
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
maxHeapSize = "8G"
useJUnitPlatform()
}

View File

@ -1,43 +0,0 @@
plugins {
id 'java'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation libs.bundles.slf4j
implementation project(':code:libraries:blocking-thread-pool')
implementation project(':code:common:model')
implementation libs.notnull
implementation libs.jsoup
implementation libs.sqlite
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.guava
implementation libs.zstd
implementation libs.trove
implementation libs.commons.compress
implementation libs.xz
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
maxHeapSize = "8G"
useJUnitPlatform()
}

View File

@ -1,18 +0,0 @@
Stackexchange's data is a jumble of questions and answers,
where the answers refer to their questions via a ParentId field,
e.g.
```xml
<?xml version="1.0" encoding="utf-8"?>
<posts>
<row Id="1" PostTypeId="1" AcceptedAnswerId="51" CreationDate="2016-01-12T18:45:19.963" Score="10" ViewCount="424" Body="&lt;p&gt;When I've printed an object I've had to choose between high resolution and quick prints. What techniques or technologies can I use or deploy to speed up my high resolution prints?&lt;/p&gt;&#xA;" OwnerUserId="16" LastActivityDate="2017-10-31T02:31:08.560" Title="How to obtain high resolution prints in a shorter period of time?" Tags="&lt;resolution&gt;&lt;speed&gt;&lt;quality&gt;" AnswerCount="2" CommentCount="6" ContentLicense="CC BY-SA 3.0" />
<row Id="2" PostTypeId="1" AcceptedAnswerId="12" CreationDate="2016-01-12T18:45:51.287" Score="34" ViewCount="7377" Body="&lt;p&gt;I would like to buy a 3D printer, but I'm concerned about the health risks that are associated with its operation. Some groups of scientists say it can be &lt;a href=&quot;http://www.techworld.com/news/personal-tech/scientists-warn-of-3d-printing-health-effects-as-tech-hits-high-street-3460992/&quot;&gt;harmful&lt;/a&gt; for humans.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;What do I need to consider before buying a 3D printer if I care about my health? Are there any safe printers?&lt;/p&gt;&#xA;" OwnerUserId="20" LastEditorUserId="334" LastEditDate="2016-11-15T16:16:11.163" LastActivityDate="2019-06-10T23:18:34.190" Title="Is 3D printing safe for your health?" Tags="&lt;print-material&gt;&lt;safety&gt;&lt;health&gt;" AnswerCount="4" CommentCount="1" ContentLicense="CC BY-SA 3.0" />
<row Id="12" PostTypeId="2" ParentId="2" CreationDate="2016-01-12T19:13:00.710" Score="23" Body="&lt;p&gt;There is very little information about safety available, as home 3D printers are relatively new. However, plastics such as ABS have a long history in making plastic products, and a study found..." />
</posts>
```
Since the search engine wants to extract keywords for each thread
holistically, not by question or answer, it is necessary to re-arrange
the data (which is very large). SQLite does a decent job of enabling
this task.
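A minimal sketch of that staging step (the table layout and class below are illustrative, not the actual implementation): posts are inserted into an indexed SQLite table keyed by thread, then read back ordered by thread id so a question and all of its answers can be processed together.
```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

class ThreadRegroupingSketch {
    public static void main(String[] args) throws SQLException {
        try (Connection conn = DriverManager.getConnection("jdbc:sqlite:posts.db")) {
            try (var stmt = conn.createStatement()) {
                // threadId is the post's own Id for questions, and ParentId for answers
                stmt.execute("""
                        CREATE TABLE IF NOT EXISTS posts (
                            id INTEGER PRIMARY KEY,
                            threadId INTEGER NOT NULL,
                            body TEXT NOT NULL
                        )""");
                stmt.execute("CREATE INDEX IF NOT EXISTS posts_thread ON posts(threadId)");
            }

            // ... rows from the XML dump are inserted here as they are parsed ...

            // Reading back ordered by threadId yields each question together with its answers,
            // so keywords can be extracted per thread rather than per post
            try (PreparedStatement ps = conn.prepareStatement(
                    "SELECT threadId, body FROM posts ORDER BY threadId");
                 ResultSet rs = ps.executeQuery()) {
                while (rs.next()) {
                    long threadId = rs.getLong("threadId");
                    String body = rs.getString("body");
                    // accumulate bodies per threadId, then run keyword extraction per thread
                }
            }
        }
    }
}
```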

View File

@ -1,42 +0,0 @@
plugins {
id 'java'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.jsoup
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.guava
implementation libs.bundles.gson
implementation libs.trove
implementation libs.fastutil
implementation libs.commons.lang3
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation project(':code:features-convert:keyword-extraction')
testImplementation project(':code:libraries:language-processing')
testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:common:config')
testImplementation project(':code:common:model')
}

View File

@ -1,25 +0,0 @@
# Summary Extraction
This feature attempts to find a descriptive passage of text that summarizes
what a search result "is about". It's the text you see below a search result.
It must solve two problems:
1. Identify which part of the document contains "the text".
The crux is that the document may be anywhere from 1993 to the present, with era-appropriate
formatting. It may be formatted with `<center>`-ed `<font>` tags, or semantic HTML5.
2. Identify which part of "the text" best describes the document.
It uses several naive heuristics to try to find something that makes sense,
and there is probably room for improvement.
There are many good techniques for doing this, but they've sadly not proved
particularly fast. Whatever solution is used needs to be able to summarize on the
order of 100,000,000 documents within a time budget of a couple of hours.
## Central Classes
* [SummaryExtractor](java/nu/marginalia/summary/SummaryExtractor.java)
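As a naive sketch of one such heuristic (the real SummaryExtractor combines several, and the class below is illustrative): pick the paragraph or div with the most text of its own and truncate it.
```java
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

class LongestOwnTextHeuristic {
    String extract(Document doc, int maxLength) {
        Element best = null;
        int bestLength = 0;

        for (Element el : doc.select("p, div")) {
            int ownTextLength = el.ownText().length(); // text directly in this element, excluding children
            if (ownTextLength > bestLength) {
                bestLength = ownTextLength;
                best = el;
            }
        }

        if (best == null)
            return "";

        String text = best.ownText();
        return text.length() <= maxLength ? text : text.substring(0, maxLength);
    }
}
```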

View File

@ -1,34 +0,0 @@
plugins {
id 'java'
id "de.undercouch.download" version "5.1.0"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:common:config')
implementation project(':code:libraries:language-processing')
implementation project(':third-party:porterstemmer')
implementation libs.bundles.slf4j
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.notnull
implementation libs.jsoup
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}

View File

@ -1,4 +0,0 @@
# Topic Detection
This is an experiment in using hand-crafted naive Bayesian filters to detect the topic of a website.
It's noteworthy that it detects recipes very well.
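A rough sketch of the idea (the actual detectors, such as RecipeDetector, differ in detail): hand-picked indicative terms are stemmed and weighted, and a document scores by how many of them it contains relative to its length.
```java
import ca.rmen.porterstemmer.PorterStemmer;

import java.util.HashMap;
import java.util.Map;

class SimpleTopicDetector {
    private final PorterStemmer stemmer = new PorterStemmer();
    private final Map<String, Double> termWeights = new HashMap<>();

    SimpleTopicDetector() {
        // Hand-crafted indicative terms; stemming lets "baking" match "bake", etc.
        termWeights.put(stemmer.stemWord("recipe"), 1.0);
        termWeights.put(stemmer.stemWord("tablespoon"), 0.8);
        termWeights.put(stemmer.stemWord("oven"), 0.5);
    }

    /** Returns the weighted fraction of words that are indicative of the topic. */
    double score(String[] words) {
        if (words.length == 0)
            return 0;

        double hits = 0;
        for (String word : words) {
            hits += termWeights.getOrDefault(stemmer.stemWord(word.toLowerCase()), 0.0);
        }
        return hits / words.length;
    }
}
```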

View File

@ -1,8 +0,0 @@
# Crawl Features
These are bits of search-engine-related code: relatively isolated pieces of business logic
that benefit from the clarity of being kept separate from the rest of the crawling code.
* [content-type](content-type/) - Content Type identification
* [crawl-blocklist](crawl-blocklist/) - IP and URL blocklists
* [link-parser](link-parser/) - Code for parsing and normalizing links

View File

@ -31,7 +31,7 @@ dependencies {
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:features-convert:keyword-extraction')
implementation project(':code:processes:converting-process:ft-keyword-extraction')
implementation libs.bundles.slf4j

View File

@ -1,5 +1,7 @@
plugins {
id 'java'
id 'application'
id 'org.graalvm.buildtools.native' version '0.10.2'
}
java {
@ -9,7 +11,51 @@ java {
}
apply from: "$rootProject.projectDir/srcsets.gradle"
sourceSets {
main {
java {
srcDirs = [
'java',
'build/generated/source/proto/main/grpc',
'build/generated/source/proto/main/java'
]
}
resources {
srcDirs = [ 'resources' ]
}
}
test {
java {
srcDirs = [ 'test' ]
}
resources {
srcDirs = [ 'test-resources' ]
}
}
demo {
java {
srcDirs = [ 'demo' ]
}
resources {
srcDirs = [ 'demo-resources' ]
}
}
}
application {
mainClass = 'demo.OneBillionRowsDemo'
}
graalvmNative {
binaries.all {
resources.autodetect()
buildArgs=['-H:+ForeignAPISupport', '-H:+UnlockExperimentalVMOptions']
}
toolchainDetection = false
}
dependencies {
implementation libs.bundles.slf4j
@ -24,7 +70,14 @@ dependencies {
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation libs.sqlite
demoImplementation sourceSets.main.output
demoImplementation libs.bundles.slf4j
demoImplementation libs.notnull
demoImplementation libs.commons.lang3
demoImplementation libs.lz4
demoImplementation libs.commons.compress
demoImplementation libs.zstd
demoImplementation libs.duckdb
}
test {

View File

@ -0,0 +1,146 @@
# Slop
Slop is a library for columnar data persistence. It is designed to be used for storing large amounts of data in a way
that is both fast and memory-efficient. The data is write-once, and the slop library offers many facilities for
deciding how it should be stored and accessed.
Slop is designed as a low-abstraction, what-you-see-is-what-you-do library. The reason for
this is to be able to eliminate copies and other overheads that are common in higher-level
libraries. The intent is to get the performance of a hand-rolled solution, but
without the complexity and brittleness that come with hand-rolling an ad-hoc row-based storage
format.
A lot of what would commonly be kept in a schema description is instead just
implemented as code. To aid with portability, Slop stores schema information
in the file names of the data files, alongside the actual name of the column itself.
A table of demographic information may end up stored in files like this:
```text
cities.0.dat.s8[].gz
cities.0.dat-len.varint-le.bin
population.0.dat.s32le.bin
average-age.0.dat.f64le.gz
```
The slop library offers some facilities to aid with data integrity, such as the SlopTable
class, which is a wrapper that ensures consistent positions for a group of columns, and aids
in closing the columns when they are no longer needed.
## Why though?
Slop is fast.
Depending on compression and encoding choices, it's possible
to get read speeds that are 5-20x faster than reading from an SQLite database.
When compression is disabled, Slop will memory-map the data, and depending on the
contents of the column, it's possible to perform zero-copy reads.
Slop is compact.
Depending on compression and encoding choices, the format will be smaller
than a Parquet file containing the equivalent information.
Slop is simple.
There isn't much magic going on under the hood in Slop. It's designed with the philosophy that a competent programmer
should be able to reverse engineer the format of the data by just
looking at a directory listing of the data files.
### Relaxed 1BRC (no CSV ingestion time)
Slop is reasonably competitive with DuckDB in terms of read speed,
especially when DuckDB is reading from Parquet, and Slop's data on disk tends
to be smaller.
This is noteworthy given that Slop is a single-threaded JVM application,
while DuckDB is a multi-threaded C++ application.
| Impl | Runtime | Size On Disk |
|----------------------------|---------|--------------|
| DuckDB in memory | 2.6s | 3.0 GB |
| Slop in vanilla Java s16 | 4.2s | 2.8 GB |
| Slop in vanilla Java s32 | 4.5s | 3.8 GB |
| Parquet (Snappy) in DuckDB | 4.5s | 5.5 GB |
| Parquet (Zstd) in DuckDB | 5.5s | 3.0 GB |
## Example
With Slop it's desirable to keep the schema information in the code. This is an example of how you might use Slop to
store a table of data with three columns: city, population, and average-age. The city column is a gzip-compressed
string column (its string lengths are stored with a varint coding, i.e. like how utf-8 works), the population column
is a little-endian integer, and the average-age column is a little-endian double.
The data is stored in a directory, and is written and read using the `Population.Writer` and `Population.Reader` classes.
The `Population` class is itself a record, and the schema is stored as static fields in the `Population` class.
```java
record Population(String city, int population, double avgAge) {

    private static final ColumnDesc<StringColumnReader, StringColumnWriter> citiesColumn =
            new ColumnDesc<>("cities", ColumnType.STRING, StorageType.GZIP);
    private static final ColumnDesc<IntColumnReader, IntColumnWriter> populationColumn =
            new ColumnDesc<>("population", ColumnType.INT_LE, StorageType.PLAIN);
    private static final ColumnDesc<DoubleColumnReader, DoubleColumnWriter> averageAgeColumn =
            new ColumnDesc<>("average-age", ColumnType.DOUBLE_LE, StorageType.PLAIN);

    public static class Writer extends SlopTable {
        private final StringColumnWriter citiesWriter;
        private final IntColumnWriter populationWriter;
        private final DoubleColumnWriter avgAgeWriter;

        public Writer(Path baseDir) throws IOException {
            citiesWriter = citiesColumn.create(this, baseDir);
            populationWriter = populationColumn.create(this, baseDir);
            avgAgeWriter = averageAgeColumn.create(this, baseDir);
        }

        public void write(Population data) throws IOException {
            citiesWriter.put(data.city());
            populationWriter.put(data.population());
            avgAgeWriter.put(data.avgAge());
        }
    }

    public static class Reader extends SlopTable {
        private final StringColumnReader citiesReader;
        private final IntColumnReader populationReader;
        private final DoubleColumnReader avgAgeReader;

        public Reader(Path baseDir) throws IOException {
            citiesReader = citiesColumn.open(this, baseDir);
            populationReader = populationColumn.open(this, baseDir);
            avgAgeReader = averageAgeColumn.open(this, baseDir);
        }

        public boolean hasRemaining() throws IOException {
            return citiesReader.hasRemaining();
        }

        public Population read() throws IOException {
            return new Population(
                    citiesReader.get(),
                    populationReader.get(),
                    avgAgeReader.get()
            );
        }
    }
}
```
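A possible usage sketch for the example above (assuming, as the description of SlopTable suggests, that the writer and reader can be closed via try-with-resources; the sample values are placeholders):
```java
Path baseDir = Path.of("/tmp/population-demo");
Files.createDirectories(baseDir);

try (var writer = new Population.Writer(baseDir)) {   // assumes SlopTable is AutoCloseable
    writer.write(new Population("Exampleburg", 120_000, 41.5));
    writer.write(new Population("Sampletown", 85_000, 38.2));
}

try (var reader = new Population.Reader(baseDir)) {
    while (reader.hasRemaining()) {
        System.out.println(reader.read());
    }
}
```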
## Nested Records
TBW
## Column Types
TBW
## Storage Types
TBW
## Extension
TBW

View File

@ -47,18 +47,12 @@ dependencies {
implementation project(':code:processes:converting-process:model')
implementation project(':code:processes:crawling-process:model')
implementation project(':code:features-convert:adblock')
implementation project(':code:features-convert:anchor-keywords')
implementation project(':code:features-convert:topic-detection')
implementation project(':code:features-convert:pubdate')
implementation project(':code:features-convert:keyword-extraction')
implementation project(':code:features-convert:summary-extraction')
implementation project(':code:features-convert:stackexchange-xml')
implementation project(':code:features-convert:reddit-json')
implementation project(':code:processes:converting-process:ft-anchor-keywords')
implementation project(':code:processes:converting-process:ft-keyword-extraction')
implementation project(':code:features-crawl:crawl-blocklist')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-crawl:content-type')
implementation project(':code:processes:crawling-process:ft-crawl-blocklist')
implementation project(':code:processes:crawling-process:ft-link-parser')
implementation project(':code:processes:crawling-process:ft-content-type')
testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:processes:crawling-process:model')

View File

@ -17,7 +17,7 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:db')
implementation project(':code:common:process')
implementation project(':code:features-convert:keyword-extraction')
implementation project(':code:processes:converting-process:ft-keyword-extraction')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')

View File

@ -1,4 +1,4 @@
package nu.marginalia.adblock;
package nu.marginalia.converting.processor.classifier.adblock;
import com.google.inject.Inject;
import com.google.inject.Singleton;

View File

@ -1,4 +1,4 @@
package nu.marginalia.adblock;
package nu.marginalia.converting.processor.classifier.adblock;
import org.jsoup.nodes.Document;

View File

@ -1,4 +1,4 @@
package nu.marginalia.topic;
package nu.marginalia.converting.processor.classifier.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;

View File

@ -1,4 +1,4 @@
package nu.marginalia.topic;
package nu.marginalia.converting.processor.classifier.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;

View File

@ -1,4 +1,4 @@
package nu.marginalia.topic;
package nu.marginalia.converting.processor.classifier.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;

View File

@ -2,14 +2,14 @@ package nu.marginalia.converting.processor.logic;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.processor.classifier.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.processor.classifier.topic.RecipeDetector;
import nu.marginalia.converting.processor.classifier.topic.TextileCraftDetector;
import nu.marginalia.converting.processor.classifier.topic.WoodworkingDetector;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.topic.RecipeDetector;
import nu.marginalia.topic.TextileCraftDetector;
import nu.marginalia.topic.WoodworkingDetector;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

View File

@ -12,6 +12,7 @@ import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
import nu.marginalia.converting.processor.logic.links.FileLinks;
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
import nu.marginalia.converting.processor.plugin.specialization.HtmlProcessorSpecializations;
import nu.marginalia.converting.processor.pubdate.PubDateSniffer;
import nu.marginalia.gregex.GuardedRegex;
import nu.marginalia.gregex.GuardedRegexFactory;
import nu.marginalia.keyword.DocumentKeywordExtractor;
@ -29,7 +30,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.pubdate.PubDateSniffer;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;

View File

@ -6,7 +6,7 @@ import com.google.inject.Singleton;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.apache.logging.log4j.util.Strings;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

View File

@ -3,7 +3,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import java.util.ArrayList;

View File

@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;

View File

@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -4,7 +4,7 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -3,7 +3,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -4,7 +4,7 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

View File

@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -1,4 +1,4 @@
package nu.marginalia.pubdate;
package nu.marginalia.converting.processor.pubdate;
public enum PubDateEffortLevel {
LOW,

View File

@ -1,4 +1,4 @@
package nu.marginalia.pubdate;
package nu.marginalia.converting.processor.pubdate;
import nu.marginalia.model.html.HtmlStandard;

View File

@ -1,4 +1,4 @@
package nu.marginalia.pubdate;
package nu.marginalia.converting.processor.pubdate;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;

View File

@ -1,7 +1,7 @@
package nu.marginalia.pubdate;
package nu.marginalia.converting.processor.pubdate;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import java.time.DateTimeException;
import java.time.LocalDate;

View File

@ -1,9 +1,9 @@
package nu.marginalia.pubdate;
package nu.marginalia.converting.processor.pubdate;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.processor.pubdate.heuristic.*;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.heuristic.*;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;
import java.util.ArrayList;

View File

@ -1,11 +1,11 @@
package nu.marginalia.pubdate.heuristic;
package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

View File

@ -1,12 +1,12 @@
package nu.marginalia.pubdate.heuristic;
package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateFromHtmlStandard;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;

View File

@ -1,11 +1,11 @@
package nu.marginalia.pubdate.heuristic;
package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;

View File

@ -1,11 +1,11 @@
package nu.marginalia.pubdate.heuristic;
package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;

View File

@ -1,11 +1,11 @@
package nu.marginalia.pubdate.heuristic;
package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;

View File

@ -1,11 +1,11 @@
package nu.marginalia.pubdate.heuristic;
package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;

View File

@ -1,16 +1,16 @@
package nu.marginalia.pubdate.heuristic;
package nu.marginalia.converting.processor.pubdate.heuristic;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonSyntaxException;
import com.google.gson.annotations.SerializedName;
import lombok.ToString;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Collections;

View File

@ -1,11 +1,11 @@
package nu.marginalia.pubdate.heuristic;
package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;

View File

@ -1,11 +1,11 @@
package nu.marginalia.pubdate.heuristic;
package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;

View File

@ -1,11 +1,11 @@
package nu.marginalia.pubdate.heuristic;
package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;

View File

@ -1,11 +1,11 @@
package nu.marginalia.pubdate.heuristic;
package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;

View File

@ -1,11 +1,11 @@
package nu.marginalia.pubdate.heuristic;
package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;

View File

@ -1,11 +1,11 @@
package nu.marginalia.pubdate.heuristic;
package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;

View File

@ -1,8 +1,8 @@
package nu.marginalia.summary;
package nu.marginalia.converting.processor.summary;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.summary.heuristic.*;
import nu.marginalia.converting.processor.summary.heuristic.*;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;

View File

@ -1,4 +1,4 @@
package nu.marginalia.summary.heuristic;
package nu.marginalia.converting.processor.summary.heuristic;
import com.google.inject.Inject;
import com.google.inject.name.Named;

View File

@ -1,4 +1,4 @@
package nu.marginalia.summary.heuristic;
package nu.marginalia.converting.processor.summary.heuristic;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

View File

@ -1,4 +1,4 @@
package nu.marginalia.summary.heuristic;
package nu.marginalia.converting.processor.summary.heuristic;
import org.apache.commons.lang3.StringUtils;

Some files were not shown because too many files have changed in this diff.