mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 04:58:59 +00:00

(restructure) Clean up repo by moving stray features into converter-process and crawler-process

parent 7e4efa45b8
commit 80900107f7
@@ -44,8 +44,8 @@ subprojects.forEach {it ->
 }
 
 ext {
-    jvmVersion=21
-    dockerImageBase='container-registry.oracle.com/graalvm/jdk:21@sha256:1fd33d4d4eba3a9e1a41a728e39ea217178d257694eea1214fec68d2ed4d3d9b'
+    jvmVersion=22
+    dockerImageBase='container-registry.oracle.com/graalvm/jdk:22'
     dockerImageTag='latest'
     dockerImageRegistry='marginalia'
 }
@@ -40,10 +40,8 @@ dependencies {
 
     implementation project(':code:processes:crawling-process:model')
-    implementation project(':code:features-crawl:link-parser')
-    implementation project(':code:features-convert:data-extractors')
-    implementation project(':code:features-convert:stackexchange-xml')
-    implementation project(':code:features-convert:reddit-json')
+    implementation project(':code:processes:crawling-process:ft-link-parser')
+    implementation project(':code:execution:data-extractors')
     implementation project(':code:index:index-journal')
     implementation project(':code:index:api')
     implementation project(':code:processes:process-mq-api')
 
@@ -22,8 +22,8 @@ dependencies {
     implementation project(':code:libraries:language-processing')
     implementation project(':code:libraries:term-frequency-dict')
     implementation project(':code:libraries:blocking-thread-pool')
-    implementation project(':code:features-crawl:link-parser')
-    implementation project(':code:features-convert:anchor-keywords')
+    implementation project(':code:processes:crawling-process:ft-link-parser')
+    implementation project(':code:processes:converting-process:ft-anchor-keywords')
     implementation project(':code:processes:crawling-process:model')
     implementation project(':code:processes:converting-process')
     implementation project(':third-party:commons-codec')
@@ -1,33 +0,0 @@
-plugins {
-    id 'java'
-
-
-    id "de.undercouch.download" version "5.1.0"
-
-    id 'jvm-test-suite'
-}
-
-java {
-    toolchain {
-        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
-    }
-}
-
-apply from: "$rootProject.projectDir/srcsets.gradle"
-
-dependencies {
-    implementation project(':code:common:config')
-
-    implementation libs.bundles.slf4j
-    implementation libs.guava
-    implementation dependencies.create(libs.guice.get()) {
-        exclude group: 'com.google.guava'
-    }
-    implementation libs.notnull
-    implementation libs.jsoup
-
-    testImplementation libs.bundles.slf4j.test
-    testImplementation libs.bundles.junit
-    testImplementation libs.mockito
-}
-
@@ -1,8 +0,0 @@
-# Adblock
-
-Contains an adblock simulator that reads an adblock specifications file and
-uses it to identify if a document has ads.
-
-## Central Classes
-
-* [AdblockSimulator](java/nu/marginalia/adblock/AdblockSimulator.java)
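The removed readme gives the gist: match a document's resource references against a rule file. As a hedged illustration only (the real AdblockSimulator parses actual adblock specification files; the class, patterns, and selectors below are invented), the core idea fits in a few lines of jsoup:

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.List;

// Minimal sketch of the adblock-simulation idea: flag a document as carrying
// ads when any element's src attribute matches an adblock-style pattern.
// Patterns and selectors are invented for illustration.
class AdblockSketch {
    private final List<String> patterns;

    AdblockSketch(List<String> patterns) {
        this.patterns = patterns;
    }

    boolean hasAds(Document doc) {
        return doc.select("img[src], script[src], iframe[src]").stream()
                .anyMatch(el -> patterns.stream().anyMatch(el.attr("src")::contains));
    }

    public static void main(String[] args) {
        var sketch = new AdblockSketch(List.of("adserver", "doubleclick"));
        Document doc = Jsoup.parse("<img src='https://adserver.example/banner.png'>");
        System.out.println(sketch.hasAds(doc)); // true
    }
}
```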
@@ -1,34 +0,0 @@
-plugins {
-    id 'java'
-
-
-    id "de.undercouch.download" version "5.1.0"
-
-    id 'jvm-test-suite'
-}
-
-java {
-    toolchain {
-        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
-    }
-}
-
-apply from: "$rootProject.projectDir/srcsets.gradle"
-
-dependencies {
-    implementation project(':code:common:model')
-
-    implementation libs.bundles.slf4j
-    implementation libs.guava
-    implementation dependencies.create(libs.guice.get()) {
-        exclude group: 'com.google.guava'
-    }
-    implementation libs.notnull
-    implementation libs.bundles.gson
-    implementation libs.jsoup
-
-    testImplementation libs.bundles.slf4j.test
-    testImplementation libs.bundles.junit
-    testImplementation libs.mockito
-    testImplementation project(':code:common:config')
-}
@@ -1,7 +0,0 @@
-# Pubdate
-
-Contains advanced haruspicy for figuring out when a document was published.
-
-## Central Classes
-
-* [PubDateSniffer](java/nu/marginalia/pubdate/PubDateSniffer.java)
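The PubDateSniffer itself survives the commit, moving to nu.marginalia.converting.processor.pubdate. For flavor, a toy version of one of the heuristics it chains together might look like this; the selector list is an assumption for illustration, not the real heuristic set:

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.Optional;

// Toy illustration of a single pub-date heuristic: probe well-known
// meta/time tags. The real sniffer chains many heuristics by effort level.
class MetaTagPubDateSketch {
    static Optional<String> sniff(Document doc) {
        for (String selector : new String[] {
                "meta[property=article:published_time]",
                "meta[name=date]",
                "time[datetime]" }) {
            var el = doc.selectFirst(selector);
            if (el != null) {
                String value = el.hasAttr("content") ? el.attr("content") : el.attr("datetime");
                if (!value.isBlank()) return Optional.of(value);
            }
        }
        return Optional.empty();
    }

    public static void main(String[] args) {
        Document doc = Jsoup.parse(
                "<meta property='article:published_time' content='2016-01-12'>");
        System.out.println(sniff(doc)); // Optional[2016-01-12]
    }
}
```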
@@ -1,13 +0,0 @@
-# Converter Features
-
-## Major features
-
-* [keyword-extraction](keyword-extraction/) - Identifies keywords to index in a document
-* [summary-extraction](summary-extraction/) - Generate an excerpt/quote from a website to display on the search results page.
-
-
-## Smaller features:
-
-* [adblock](adblock/) - Simulates Adblock
-* [pubdate](pubdate/) - Determines when a document was published
-* [topic-detection](topic-detection/) - Tries to identify the topic of a website
@@ -1,44 +0,0 @@
-plugins {
-    id 'java'
-
-    id 'jvm-test-suite'
-}
-
-java {
-    toolchain {
-        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
-    }
-}
-
-apply from: "$rootProject.projectDir/srcsets.gradle"
-
-dependencies {
-    implementation libs.bundles.slf4j
-
-    implementation project(':code:libraries:blocking-thread-pool')
-    implementation project(':code:common:model')
-    implementation libs.notnull
-
-    implementation libs.jsoup
-    implementation libs.sqlite
-
-    implementation libs.guava
-    implementation dependencies.create(libs.guice.get()) {
-        exclude group: 'com.google.guava'
-    }
-    implementation libs.guava
-    implementation libs.gson
-    implementation libs.zstd
-    implementation libs.trove
-    implementation libs.commons.compress
-    implementation libs.xz
-
-    testImplementation libs.bundles.slf4j.test
-    testImplementation libs.bundles.junit
-    testImplementation libs.mockito
-}
-
-test {
-    maxHeapSize = "8G"
-    useJUnitPlatform()
-}
@@ -1,43 +0,0 @@
-plugins {
-    id 'java'
-
-    id 'jvm-test-suite'
-}
-
-java {
-    toolchain {
-        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
-    }
-}
-
-apply from: "$rootProject.projectDir/srcsets.gradle"
-
-dependencies {
-    implementation libs.bundles.slf4j
-
-    implementation project(':code:libraries:blocking-thread-pool')
-    implementation project(':code:common:model')
-    implementation libs.notnull
-
-    implementation libs.jsoup
-    implementation libs.sqlite
-
-    implementation libs.guava
-    implementation dependencies.create(libs.guice.get()) {
-        exclude group: 'com.google.guava'
-    }
-    implementation libs.guava
-    implementation libs.zstd
-    implementation libs.trove
-    implementation libs.commons.compress
-    implementation libs.xz
-
-    testImplementation libs.bundles.slf4j.test
-    testImplementation libs.bundles.junit
-    testImplementation libs.mockito
-}
-
-test {
-    maxHeapSize = "8G"
-    useJUnitPlatform()
-}
@@ -1,18 +0,0 @@
-Stackexchange's data is a jumble of questions and answers,
-where the answers refer to the questions with a parentId field.
-
-e.g.
-```xml
-<?xml version="1.0" encoding="utf-8"?>
-<posts>
-<row Id="1" PostTypeId="1" AcceptedAnswerId="51" CreationDate="2016-01-12T18:45:19.963" Score="10" ViewCount="424" Body="<p>When I've printed an object I've had to choose between high resolution and quick prints. What techniques or technologies can I use or deploy to speed up my high resolution prints?</p>" OwnerUserId="16" LastActivityDate="2017-10-31T02:31:08.560" Title="How to obtain high resolution prints in a shorter period of time?" Tags="<resolution><speed><quality>" AnswerCount="2" CommentCount="6" ContentLicense="CC BY-SA 3.0" />
-<row Id="2" PostTypeId="1" AcceptedAnswerId="12" CreationDate="2016-01-12T18:45:51.287" Score="34" ViewCount="7377" Body="<p>I would like to buy a 3D printer, but I'm concerned about the health risks that are associated with its operation. Some groups of scientists say it can be <a href="http://www.techworld.com/news/personal-tech/scientists-warn-of-3d-printing-health-effects-as-tech-hits-high-street-3460992/">harmful</a> for humans.</p> <p>What do I need to consider before buying a 3D printer if I care about my health? Are there any safe printers?</p>" OwnerUserId="20" LastEditorUserId="334" LastEditDate="2016-11-15T16:16:11.163" LastActivityDate="2019-06-10T23:18:34.190" Title="Is 3D printing safe for your health?" Tags="<print-material><safety><health>" AnswerCount="4" CommentCount="1" ContentLicense="CC BY-SA 3.0" />
-<row Id="12" PostTypeId="2" ParentId="2" CreationDate="2016-01-12T19:13:00.710" Score="23" Body="<p>There is very little information about safety available, as home 3D printers are relatively new. However, plastics such as ABS have a long history in making plastic products, and a study found..." />
-</posts>
-```
-
-Since the search engine wants to extract keywords for each thread
-holistically, not by question or answer, it is necessary to re-arrange
-the data (which is very large). SQLite does a decent job of enabling
-this task.
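The rearrangement the readme describes is easy to sketch with plain JDBC against SQLite (the deleted build file above pulls in libs.sqlite). The schema and names below are invented for illustration and assume the xerial sqlite-jdbc driver is on the classpath; the real stackexchange-xml code differs:

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

// Hypothetical sketch: load posts into SQLite, then pull each question
// together with its answers so a whole thread can be processed as one unit.
class ThreadRegroupSketch {
    public static void main(String[] args) throws SQLException {
        try (Connection conn = DriverManager.getConnection("jdbc:sqlite::memory:")) {
            try (Statement st = conn.createStatement()) {
                st.execute("CREATE TABLE posts (id INT, parentId INT, body TEXT)");
                st.execute("INSERT INTO posts VALUES (2, NULL, 'Is 3D printing safe?')");
                st.execute("INSERT INTO posts VALUES (12, 2, 'Very little data so far...')");
            }
            // One row per thread: the question body concatenated with its answers.
            String sql = """
                    SELECT q.id, q.body || ' ' || coalesce(group_concat(a.body, ' '), '')
                    FROM posts q LEFT JOIN posts a ON a.parentId = q.id
                    WHERE q.parentId IS NULL
                    GROUP BY q.id""";
            try (Statement st = conn.createStatement();
                 ResultSet rs = st.executeQuery(sql)) {
                while (rs.next()) {
                    System.out.println(rs.getInt(1) + ": " + rs.getString(2));
                }
            }
        }
    }
}
```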
@@ -1,42 +0,0 @@
-plugins {
-    id 'java'
-
-    id 'jvm-test-suite'
-}
-
-java {
-    toolchain {
-        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
-    }
-}
-
-apply from: "$rootProject.projectDir/srcsets.gradle"
-
-dependencies {
-    implementation libs.bundles.slf4j
-
-    implementation libs.notnull
-
-    implementation libs.jsoup
-
-    implementation libs.guava
-    implementation dependencies.create(libs.guice.get()) {
-        exclude group: 'com.google.guava'
-    }
-    implementation libs.guava
-    implementation libs.bundles.gson
-    implementation libs.trove
-    implementation libs.fastutil
-    implementation libs.commons.lang3
-
-    testImplementation libs.bundles.slf4j.test
-    testImplementation libs.bundles.junit
-    testImplementation libs.mockito
-
-    testImplementation project(':code:features-convert:keyword-extraction')
-    testImplementation project(':code:libraries:language-processing')
-    testImplementation project(':code:libraries:term-frequency-dict')
-    testImplementation project(':code:common:config')
-    testImplementation project(':code:common:model')
-}
-
@@ -1,25 +0,0 @@
-# Summary Extraction
-
-This feature attempts to find a descriptive passage of text that summarizes
-what a search result "is about". It's the text you see below a search result.
-
-It must solve two problems:
-
-1. Identify which part of the document that contains "the text".
-The crux is that the document may be anywhere from 1993 to the present, with era-appropriate
-formatting. It may be formatted with <center>ed <font>-tags, or semantic HTML5.
-
-2. Identify which part of "the text" best describes the document.
-
-It uses several naive heuristics to try to find something that makes sense,
-and there is probably room for improvement.
-
-There are many good techniques for doing this, but they've sadly not proved
-particularly fast. Whatever solution is used needs to be able to summarize of
-order of a 100,000,000 documents with a time budget of a couple of hours.
-
-
-## Central Classes
-
-* [SummaryExtractor](java/nu/marginalia/summary/SummaryExtractor.java)
-
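A deliberately naive version of such a heuristic, for illustration only (the real SummaryExtractor, relocated by this commit to nu.marginalia.converting.processor.summary, combines several heuristics and copes with pre-semantic HTML; the class and thresholds here are invented):

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

// Naive summary heuristic: return the first <p> long enough to plausibly
// describe the page, truncated to the display budget.
class FirstParagraphSummarySketch {
    static String summarize(Document doc, int minLength, int maxLength) {
        for (var p : doc.select("p")) {
            String text = p.text();
            if (text.length() >= minLength) {
                return text.length() <= maxLength ? text : text.substring(0, maxLength);
            }
        }
        return "";
    }

    public static void main(String[] args) {
        Document doc = Jsoup.parse(
                "<p>Menu</p><p>This page documents the care and feeding of vintage " +
                "mechanical keyboards, with teardown photos and part lists.</p>");
        System.out.println(summarize(doc, 40, 255)); // skips the nav stub
    }
}
```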
@@ -1,34 +0,0 @@
-plugins {
-    id 'java'
-
-
-    id "de.undercouch.download" version "5.1.0"
-
-    id 'jvm-test-suite'
-}
-
-java {
-    toolchain {
-        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
-    }
-}
-
-apply from: "$rootProject.projectDir/srcsets.gradle"
-
-dependencies {
-    implementation project(':code:common:config')
-    implementation project(':code:libraries:language-processing')
-    implementation project(':third-party:porterstemmer')
-
-    implementation libs.bundles.slf4j
-    implementation libs.guava
-    implementation dependencies.create(libs.guice.get()) {
-        exclude group: 'com.google.guava'
-    }
-    implementation libs.notnull
-    implementation libs.jsoup
-
-    testImplementation libs.bundles.slf4j.test
-    testImplementation libs.bundles.junit
-    testImplementation libs.mockito
-}
@@ -1,4 +0,0 @@
-# Topic Detection
-
-This is an experiment in using hand-crafted naive bayesian filters to detecting the topic of a website.
-It's noteworthy it detects recipes very well.
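The detectors themselves (RecipeDetector and friends) move to nu.marginalia.converting.processor.classifier.topic in this commit. A hand-weighted scorer in the same spirit might look like the sketch below; the terms and weights are invented, and the real detectors work on stemmed terms:

```java
import java.util.Map;

// Sketch of a hand-crafted topic scorer: sum the weights of known topic
// terms, dampened by document length. Weights are invented for illustration.
class RecipeDetectorSketch {
    private static final Map<String, Double> WEIGHTS = Map.of(
            "ingredients", 0.5,
            "preheat", 0.4,
            "tablespoon", 0.3,
            "simmer", 0.3,
            "oven", 0.2);

    static double score(String text) {
        String[] words = text.toLowerCase().split("\\W+");
        double hits = 0;
        for (String word : words) {
            hits += WEIGHTS.getOrDefault(word, 0.0);
        }
        return words.length == 0 ? 0 : hits / Math.sqrt(words.length);
    }

    public static void main(String[] args) {
        System.out.println(score("Preheat the oven and simmer the ingredients")); // high
        System.out.println(score("A history of the Byzantine empire"));          // ~0
    }
}
```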
@@ -1,8 +0,0 @@
-# Crawl Features
-
-These are bits of search-engine related code that are relatively isolated pieces of business logic,
-that benefit from the clarity of being kept separate from the rest of the crawling code.
-
-* [content-type](content-type/) - Content Type identification
-* [crawl-blocklist](crawl-blocklist/) - IP and URL blocklists
-* [link-parser](link-parser/) - Code for parsing and normalizing links
@@ -31,7 +31,7 @@ dependencies {
 
     implementation project(':code:libraries:language-processing')
     implementation project(':code:libraries:term-frequency-dict')
-    implementation project(':code:features-convert:keyword-extraction')
+    implementation project(':code:processes:converting-process:ft-keyword-extraction')
 
     implementation libs.bundles.slf4j
 
@@ -1,5 +1,7 @@
 plugins {
     id 'java'
+    id 'application'
+    id 'org.graalvm.buildtools.native' version '0.10.2'
 }
 
 java {
@@ -9,7 +11,51 @@ java {
 }
 
 
-apply from: "$rootProject.projectDir/srcsets.gradle"
+sourceSets {
+    main {
+        java {
+            srcDirs = [
+                'java',
+                'build/generated/source/proto/main/grpc',
+                'build/generated/source/proto/main/java'
+            ]
+        }
+        resources {
+            srcDirs = [ 'resources' ]
+        }
+    }
+    test {
+        java {
+            srcDirs = [ 'test' ]
+        }
+        resources {
+            srcDirs = [ 'test-resources' ]
+        }
+    }
+    demo {
+        java {
+            srcDirs = [ 'demo' ]
+        }
+        resources {
+            srcDirs = [ 'demo-resources' ]
+        }
+    }
+}
+
+application {
+    mainClass = 'demo.OneBillionRowsDemo'
+}
+
+graalvmNative {
+    binaries.all {
+        resources.autodetect()
+        buildArgs=['-H:+ForeignAPISupport', '-H:+UnlockExperimentalVMOptions']
+    }
+
+    toolchainDetection = false
+}
 
 dependencies {
     implementation libs.bundles.slf4j
@@ -24,7 +70,14 @@ dependencies {
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
 
     testImplementation libs.sqlite
+
+    demoImplementation sourceSets.main.output
+    demoImplementation libs.bundles.slf4j
+    demoImplementation libs.notnull
+    demoImplementation libs.commons.lang3
+    demoImplementation libs.lz4
+    demoImplementation libs.commons.compress
+    demoImplementation libs.zstd
+    demoImplementation libs.duckdb
 }
 
 test {
code/libraries/slop/readme.md (new file, 146 lines)
@@ -0,0 +1,146 @@
+# Slop
+
+Slop is a library for columnar data persistence. It is designed to be used for storing large amounts of data in a way
+that is both fast and memory-efficient. The data is write-once, and the slop library offers many facilities for
+deciding how it should be stored and accessed.
+
+Slop is designed as a low-abstraction, what-you-see-is-what-you-do library. The reason for
+this is to be able to eliminate copies and other overheads that are common in higher
+level libraries. The intent is to get the performance of a hand-rolled solution, but
+without the complexity and brittleness that comes with hand-rolling an ad-hoc row-based storage
+format.
+
+A lot of what would commonly be kept in a schema description is instead just
+implemented as code. To aid with portability, slop stores schema information
+in the file names of the data files, besides the actual name of the column itself.
+
+A table of demographic information may end up stored in files like this:
+
+```text
+cities.0.dat.s8[].gz
+cities.0.dat-len.varint-le.bin
+population.0.dat.s32le.bin
+average-age.0.dat.f64le.gz
+```
+
+The slop library offers some facilities to aid with data integrity, such as the SlopTable
+class, which is a wrapper that ensures consistent positions for a group of columns, and aids
+in closing the columns when they are no longer needed.
+
+## Why though?
+
+Slop is fast.
+
+Depending on compression and encoding choices, it's possible
+to get read speeds that are 5-20x faster than reading from a sqlite database.
+When compression is disabled, Slop will memory map the data, and depending on the
+contents of the column, it's possible to perform zero copy reads.
+
+Slop is compact.
+
+Depending on compression and encoding choices, the format will be smaller
+than a parquet file containing the equivalent information.
+
+Slop is simple.
+
+There isn't much magic going on under the hood in Slop. It's designed with the philosophy that a competent programmer
+should be able to reverse engineer the format of the data by just
+looking at a directory listing of the data files.
+
+### Relaxed 1BRC (no CSV ingestion time)
+
+Slop is reasonably competitive with DuckDB in terms of read speed,
+especially when reading from Parquet, and the data on disk tends
+to be smaller.
+
+This is noteworthy given Slop is a single-threaded JVM application,
+and DuckDB is a multi-threaded C++ application.
+
+| Impl                       | Runtime | Size On Disk |
+|----------------------------|---------|--------------|
+| DuckDB in memory           | 2.6s    | 3.0 GB       |
+| Slop in vanilla Java s16   | 4.2s    | 2.8 GB       |
+| Slop in vanilla Java s32   | 4.5s    | 3.8 GB       |
+| Parquet (Snappy) in DuckDB | 4.5s    | 5.5 GB       |
+| Parquet (Zstd) in DuckDB   | 5.5s    | 3.0 GB       |
+
+## Example
+
+With slop it's desirable to keep the schema information in the code. This is an example of how you might use slop to
+store a table of data with three columns: city, population, and average age. The city column is a string whose lengths
+are stored with a varint coding (i.e. like how utf-8 works), the population column is an integer, and the average-age
+column is a double.
+
+The data is stored in a directory, and the data is written and read using the `Population.Writer` and `Population.Reader` classes.
+The `Population` class is itself a record, and the schema is stored as static fields in the `Population` class.
+
+```java
+record Population(String city, int population, double avgAge) {
+
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> citiesColumn =
+            new ColumnDesc<>("cities", ColumnType.STRING, StorageType.GZIP);
+    private static final ColumnDesc<IntColumnReader, IntColumnWriter> populationColumn =
+            new ColumnDesc<>("population", ColumnType.INT_LE, StorageType.PLAIN);
+    private static final ColumnDesc<DoubleColumnReader, DoubleColumnWriter> averageAgeColumn =
+            new ColumnDesc<>("average-age", ColumnType.DOUBLE_LE, StorageType.PLAIN);
+
+    public static class Writer extends SlopTable {
+        private final StringColumnWriter citiesWriter;
+        private final IntColumnWriter populationWriter;
+        private final DoubleColumnWriter avgAgeWriter;
+
+        public Writer(Path baseDir) throws IOException {
+            citiesWriter = citiesColumn.create(this, baseDir);
+            populationWriter = populationColumn.create(this, baseDir);
+            avgAgeWriter = averageAgeColumn.create(this, baseDir);
+        }
+
+        public void write(Population data) throws IOException {
+            citiesWriter.put(data.city);
+            populationWriter.put(data.population);
+            avgAgeWriter.put(data.avgAge);
+        }
+    }
+
+    public static class Reader extends SlopTable {
+        private final StringColumnReader citiesReader;
+        private final IntColumnReader populationReader;
+        private final DoubleColumnReader avgAgeReader;
+
+        public Reader(Path baseDir) throws IOException {
+            citiesReader = citiesColumn.open(this, baseDir);
+            populationReader = populationColumn.open(this, baseDir);
+            avgAgeReader = averageAgeColumn.open(this, baseDir);
+        }
+
+        public boolean hasRemaining() throws IOException {
+            return citiesReader.hasRemaining();
+        }
+
+        public Population read() throws IOException {
+            return new Population(
+                    citiesReader.get(),
+                    populationReader.get(),
+                    avgAgeReader.get()
+            );
+        }
+    }
+}
+```
+
+## Nested Records
+
+TBW
+
+## Column Types
+
+TBW
+
+## Storage Types
+
+TBW
+
+## Extension
+
+TBW
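To round out the example above: a hypothetical usage round trip with the readme's `Population.Writer` and `Population.Reader` might look like the sketch below. It assumes the classes from the readme are on the classpath and that `SlopTable` supports try-with-resources, which the readme's note about closing columns suggests but does not confirm. The data values are invented.

```java
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

// Round-trip sketch for the Population example from the readme above.
class PopulationDemo {
    public static void main(String[] args) throws IOException {
        Path baseDir = Files.createTempDirectory("slop-demo");

        // Write two rows; each put() call appends one value to its column file.
        try (var writer = new Population.Writer(baseDir)) {
            writer.write(new Population("Gothenburg", 587_000, 41.3));
            writer.write(new Population("Stockholm", 975_000, 39.8));
        }

        // Read the rows back in insertion order.
        try (var reader = new Population.Reader(baseDir)) {
            while (reader.hasRemaining()) {
                System.out.println(reader.read());
            }
        }
    }
}
```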
@@ -47,18 +47,12 @@ dependencies {
     implementation project(':code:processes:converting-process:model')
     implementation project(':code:processes:crawling-process:model')
 
-    implementation project(':code:features-convert:adblock')
-    implementation project(':code:features-convert:anchor-keywords')
-    implementation project(':code:features-convert:topic-detection')
-    implementation project(':code:features-convert:pubdate')
-    implementation project(':code:features-convert:keyword-extraction')
-    implementation project(':code:features-convert:summary-extraction')
-    implementation project(':code:features-convert:stackexchange-xml')
-    implementation project(':code:features-convert:reddit-json')
+    implementation project(':code:processes:converting-process:ft-anchor-keywords')
+    implementation project(':code:processes:converting-process:ft-keyword-extraction')
 
-    implementation project(':code:features-crawl:crawl-blocklist')
-    implementation project(':code:features-crawl:link-parser')
-    implementation project(':code:features-crawl:content-type')
+    implementation project(':code:processes:crawling-process:ft-crawl-blocklist')
+    implementation project(':code:processes:crawling-process:ft-link-parser')
+    implementation project(':code:processes:crawling-process:ft-content-type')
 
     testImplementation project(':code:libraries:term-frequency-dict')
     testImplementation project(':code:processes:crawling-process:model')
@@ -17,7 +17,7 @@ dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:db')
     implementation project(':code:common:process')
-    implementation project(':code:features-convert:keyword-extraction')
+    implementation project(':code:processes:converting-process:ft-keyword-extraction')
     implementation project(':code:libraries:language-processing')
     implementation project(':code:libraries:term-frequency-dict')
 
@@ -1,4 +1,4 @@
-package nu.marginalia.adblock;
+package nu.marginalia.converting.processor.classifier.adblock;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
@@ -1,4 +1,4 @@
-package nu.marginalia.adblock;
+package nu.marginalia.converting.processor.classifier.adblock;
 
 import org.jsoup.nodes.Document;
 
@@ -1,4 +1,4 @@
-package nu.marginalia.topic;
+package nu.marginalia.converting.processor.classifier.topic;
 
 import ca.rmen.porterstemmer.PorterStemmer;
 import com.google.inject.Inject;
@@ -1,4 +1,4 @@
-package nu.marginalia.topic;
+package nu.marginalia.converting.processor.classifier.topic;
 
 import ca.rmen.porterstemmer.PorterStemmer;
 import com.google.inject.Inject;
@@ -1,4 +1,4 @@
-package nu.marginalia.topic;
+package nu.marginalia.converting.processor.classifier.topic;
 
 import ca.rmen.porterstemmer.PorterStemmer;
 import com.google.inject.Inject;
@@ -2,14 +2,14 @@ package nu.marginalia.converting.processor.logic;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import nu.marginalia.adblock.AdblockSimulator;
-import nu.marginalia.adblock.GoogleAnwersSpamDetector;
+import nu.marginalia.converting.processor.classifier.adblock.AdblockSimulator;
+import nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector;
+import nu.marginalia.converting.processor.classifier.topic.RecipeDetector;
+import nu.marginalia.converting.processor.classifier.topic.TextileCraftDetector;
+import nu.marginalia.converting.processor.classifier.topic.WoodworkingDetector;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.topic.RecipeDetector;
-import nu.marginalia.topic.TextileCraftDetector;
-import nu.marginalia.topic.WoodworkingDetector;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
@@ -12,6 +12,7 @@ import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
 import nu.marginalia.converting.processor.logic.links.FileLinks;
 import nu.marginalia.converting.processor.logic.links.LinkProcessor;
 import nu.marginalia.converting.processor.plugin.specialization.HtmlProcessorSpecializations;
+import nu.marginalia.converting.processor.pubdate.PubDateSniffer;
 import nu.marginalia.gregex.GuardedRegex;
 import nu.marginalia.gregex.GuardedRegexFactory;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
@@ -29,7 +30,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
-import nu.marginalia.pubdate.PubDateSniffer;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
@@ -6,7 +6,7 @@ import com.google.inject.Singleton;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.idx.WordFlags;
-import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import org.apache.logging.log4j.util.Strings;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -3,7 +3,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
-import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import org.jsoup.nodes.Document;
 
 import java.util.ArrayList;
@@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import org.apache.commons.lang3.StringUtils;
 import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
@@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -4,7 +4,7 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.model.idx.WordFlags;
-import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -3,7 +3,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -4,7 +4,7 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 
@@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -1,4 +1,4 @@
-package nu.marginalia.pubdate;
+package nu.marginalia.converting.processor.pubdate;
 
 public enum PubDateEffortLevel {
     LOW,
@@ -1,4 +1,4 @@
-package nu.marginalia.pubdate;
+package nu.marginalia.converting.processor.pubdate;
 
 import nu.marginalia.model.html.HtmlStandard;
 
@@ -1,4 +1,4 @@
-package nu.marginalia.pubdate;
+package nu.marginalia.converting.processor.pubdate;
 
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
@@ -1,7 +1,7 @@
-package nu.marginalia.pubdate;
+package nu.marginalia.converting.processor.pubdate;
 
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.html.HtmlStandard;
 
 import java.time.DateTimeException;
 import java.time.LocalDate;
@@ -1,9 +1,9 @@
-package nu.marginalia.pubdate;
+package nu.marginalia.converting.processor.pubdate;
 
-import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.converting.processor.pubdate.heuristic.*;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.pubdate.heuristic.*;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.ArrayList;
@@ -1,11 +1,11 @@
-package nu.marginalia.pubdate.heuristic;
+package nu.marginalia.converting.processor.pubdate.heuristic;
 
-import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.pubdate.PubDateParser;
+import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
+import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
+import nu.marginalia.converting.processor.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.pubdate.PubDateEffortLevel;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.html.HtmlStandard;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -1,12 +1,12 @@
-package nu.marginalia.pubdate.heuristic;
+package nu.marginalia.converting.processor.pubdate.heuristic;
 
-import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.pubdate.PubDateEffortLevel;
-import nu.marginalia.pubdate.PubDateFromHtmlStandard;
-import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.pubdate.PubDateParser;
+import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
+import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard;
+import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
+import nu.marginalia.converting.processor.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.html.HtmlStandard;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Node;
@@ -1,11 +1,11 @@
-package nu.marginalia.pubdate.heuristic;
+package nu.marginalia.converting.processor.pubdate.heuristic;
 
-import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.pubdate.PubDateParser;
+import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
+import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
+import nu.marginalia.converting.processor.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.pubdate.PubDateEffortLevel;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -1,11 +1,11 @@
-package nu.marginalia.pubdate.heuristic;
+package nu.marginalia.converting.processor.pubdate.heuristic;
 
-import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.pubdate.PubDateParser;
+import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
+import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
+import nu.marginalia.converting.processor.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.pubdate.PubDateEffortLevel;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -1,11 +1,11 @@
-package nu.marginalia.pubdate.heuristic;
+package nu.marginalia.converting.processor.pubdate.heuristic;
 
-import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.pubdate.PubDateParser;
+import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
+import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
+import nu.marginalia.converting.processor.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.pubdate.PubDateEffortLevel;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -1,11 +1,11 @@
-package nu.marginalia.pubdate.heuristic;
+package nu.marginalia.converting.processor.pubdate.heuristic;
 
-import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.pubdate.PubDateParser;
+import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
+import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
+import nu.marginalia.converting.processor.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.pubdate.PubDateEffortLevel;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -1,16 +1,16 @@
-package nu.marginalia.pubdate.heuristic;
+package nu.marginalia.converting.processor.pubdate.heuristic;
 
 import com.google.gson.Gson;
 import com.google.gson.GsonBuilder;
 import com.google.gson.JsonSyntaxException;
 import com.google.gson.annotations.SerializedName;
 import lombok.ToString;
-import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.pubdate.PubDateParser;
+import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
+import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
+import nu.marginalia.converting.processor.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.pubdate.PubDateEffortLevel;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Collections;
@@ -1,11 +1,11 @@
-package nu.marginalia.pubdate.heuristic;
+package nu.marginalia.converting.processor.pubdate.heuristic;
 
-import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.pubdate.PubDateParser;
+import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
+import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
+import nu.marginalia.converting.processor.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.pubdate.PubDateEffortLevel;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -1,11 +1,11 @@
-package nu.marginalia.pubdate.heuristic;
+package nu.marginalia.converting.processor.pubdate.heuristic;
 
-import nu.marginalia.model.html.HtmlStandard;
+import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
+import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
+import nu.marginalia.converting.processor.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.pubdate.PubDateParser;
-import nu.marginalia.pubdate.PubDateEffortLevel;
+import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -1,11 +1,11 @@
-package nu.marginalia.pubdate.heuristic;
+package nu.marginalia.converting.processor.pubdate.heuristic;
 
-import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.pubdate.PubDateEffortLevel;
-import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.pubdate.PubDateParser;
+import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
+import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
+import nu.marginalia.converting.processor.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -1,11 +1,11 @@
-package nu.marginalia.pubdate.heuristic;
+package nu.marginalia.converting.processor.pubdate.heuristic;
 
-import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.pubdate.PubDateEffortLevel;
-import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.pubdate.PubDateParser;
+import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
+import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
+import nu.marginalia.converting.processor.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -1,11 +1,11 @@
-package nu.marginalia.pubdate.heuristic;
+package nu.marginalia.converting.processor.pubdate.heuristic;
 
-import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.pubdate.PubDateParser;
+import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
+import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
+import nu.marginalia.converting.processor.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.pubdate.PubDateEffortLevel;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -1,11 +1,11 @@
-package nu.marginalia.pubdate.heuristic;
+package nu.marginalia.converting.processor.pubdate.heuristic;
 
-import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.pubdate.PubDateHeuristic;
-import nu.marginalia.pubdate.PubDateParser;
+import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
+import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
+import nu.marginalia.converting.processor.pubdate.PubDateParser;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.pubdate.PubDateEffortLevel;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -1,8 +1,8 @@
-package nu.marginalia.summary;
+package nu.marginalia.converting.processor.summary;
 
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
-import nu.marginalia.summary.heuristic.*;
+import nu.marginalia.converting.processor.summary.heuristic.*;
 import org.apache.commons.lang3.StringUtils;
 import org.jsoup.nodes.Document;
 
@@ -1,4 +1,4 @@
-package nu.marginalia.summary.heuristic;
+package nu.marginalia.converting.processor.summary.heuristic;
 
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
@@ -1,4 +1,4 @@
-package nu.marginalia.summary.heuristic;
+package nu.marginalia.converting.processor.summary.heuristic;
 
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -1,4 +1,4 @@
-package nu.marginalia.summary.heuristic;
+package nu.marginalia.converting.processor.summary.heuristic;
 
 import org.apache.commons.lang3.StringUtils;
 
Some files were not shown because too many files have changed in this diff.