(refactor) Remove converting-model package completely

This commit is contained in:
Viktor Lofgren 2023-09-14 11:21:44 +02:00
parent c71f6ad417
commit eaeb23d41e
37 changed files with 40 additions and 102 deletions

View File

@ -1,6 +1,6 @@
package nu.marginalia.converting.model;
package nu.marginalia.model.html;
// This class really doesn't belong anywhere, but will squat here for now
public enum HtmlStandard {
PLAIN(0, 1),
UNKNOWN(0, 1),

View File

@ -15,7 +15,6 @@ java {
dependencies {
implementation project(':code:common:model')
implementation project(':code:process-models:converting-model')
implementation libs.lombok
annotationProcessor libs.lombok

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
public class PubDateFromHtmlStandard {
/** Used to bias pub date heuristics */

View File

@ -2,7 +2,7 @@ package nu.marginalia.pubdate;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import java.time.DateTimeException;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.heuristic.*;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateFromHtmlStandard;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -5,7 +5,7 @@ import com.google.gson.GsonBuilder;
import com.google.gson.JsonSyntaxException;
import com.google.gson.annotations.SerializedName;
import lombok.ToString;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -2,7 +2,7 @@ package nu.marginalia.pubdate;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;

View File

@ -1,47 +0,0 @@
plugins {
id 'java'
id "io.freefair.lombok" version "8.2.2"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(20))
}
}
dependencies {
//implementation project(':third-party:monkey-patch-gson')
implementation project(':code:common:db')
implementation project(':code:common:model')
implementation project(':code:api:index-api')
implementation project(':code:common:service-discovery')
implementation project(':code:common:service-client')
implementation project(':code:libraries:language-processing')
implementation project(':code:features-convert:keyword-extraction')
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.trove
implementation libs.fastutil
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
useJUnitPlatform()
}
task fastTests(type: Test) {
useJUnitPlatform {
excludeTags "slow"
}
}

View File

@ -1,3 +0,0 @@
# Converting Models
!!To be deleted!!

View File

@ -43,7 +43,6 @@ dependencies {
implementation project(':code:libraries:big-string')
implementation project(':code:libraries:language-processing')
implementation project(':code:process-models:converting-model')
implementation project(':code:process-models:processed-data')
implementation project(':code:process-models:work-log')
implementation project(':code:process-models:crawling-model')

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.model;
import lombok.ToString;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.EdgeUrl;

View File

@ -2,14 +2,13 @@ package nu.marginalia.converting.processor.logic;
import crawlercommons.utils.Strings;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.model.crawl.HtmlFeature;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;
import java.util.Set;

View File

@ -1,7 +1,7 @@
package nu.marginalia.converting.processor.logic;
import com.google.common.base.Strings;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.slf4j.Logger;

View File

@ -4,7 +4,6 @@ import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;
import java.nio.file.Path;
import java.util.HashSet;
import java.util.Set;

View File

@ -1,10 +1,9 @@
package nu.marginalia.converting.processor.plugin;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.model.DisqualifiedException;

View File

@ -13,10 +13,9 @@ import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.idx.DocumentMetadata;

View File

@ -5,10 +5,9 @@ import com.google.inject.name.Named;
import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.idx.DocumentMetadata;

View File

@ -10,6 +10,7 @@ import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;

View File

@ -3,7 +3,7 @@ package nu.marginalia.converting;
import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;

View File

@ -34,7 +34,6 @@ dependencies {
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:easy-lsh')
implementation project(':code:process-models:crawling-model')
implementation project(':code:process-models:converting-model')
implementation project(':code:features-crawl:crawl-blocklist')
implementation project(':code:features-crawl:link-parser')

View File

@ -35,7 +35,6 @@ dependencies {
testImplementation project(':code:services-core:search-service')
implementation project(':code:process-models:crawling-model')
implementation project(':code:process-models:converting-model')
implementation project(':code:process-models:processed-data')
implementation project(':code:process-models:work-log')

View File

@ -11,13 +11,17 @@ based on the content in the database.
## 2. Converting Process
The [converting-process](converting-process/) reads crawl data from the crawling step and
processes them, extracting keywords and metadata and saves them as compressed JSON models
described in [converting-model](../process-models/converting-model/).
processes them, extracting keywords and metadata and saves them as parquet files
described in [processed-data](../process-models/processed-data/).
## 3. Loading Process
The [loading-process](loading-process/) reads the processed data and creates an index journal
and lexicon, and loads domains and addresses into the MariaDB-database.
The [loading-process](loading-process/) reads the processed data.
It creates an [index journal](../features-index/index-journal),
a [link database](../common/linkdb),
and loads domains and domain-links
into the [MariaDB database](../common/db).
## 4. Index Construction Process
@ -56,21 +60,14 @@ Schematically the crawling and loading process looks like this:
+------------+ features, links, URLs
|
//==================\\
|| Compressed JSON: || Processed
|| URLs[] || Files
|| Parquet: || Processed
|| Documents[] || Files
|| Domains[] ||
|| Links[] ||
|| Keywords[] ||
|| ... ||
|| URLs[] ||
|| Domains[] ||
|| Links[] ||
|| Keywords[] ||
|| ... ||
\\==================//
|
+------------+
| LOADING | Insert URLs in link DB
+------------+ Insert domains into mariadb
| LOADING | Insert URLs, titles in link DB
| STEP | Insert keywords in Index
+------------+
|

View File

@ -31,7 +31,6 @@ dependencies {
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:big-string')
implementation project(':code:processes:converting-process')
implementation project(':code:process-models:converting-model')
implementation project(':code:process-models:crawling-model')
implementation project(':code:features-convert:adblock')

View File

@ -63,7 +63,6 @@ include 'code:processes:loading-process'
include 'code:processes:index-constructor-process'
include 'code:processes:test-data'
include 'code:process-models:converting-model'
include 'code:process-models:crawling-model'
include 'code:process-models:work-log'
include 'code:process-models:processed-data'