(refactor) Remove converting-model package completely

This commit is contained in:
Viktor Lofgren 2023-09-14 11:21:44 +02:00
parent c71f6ad417
commit eaeb23d41e
37 changed files with 40 additions and 102 deletions

View File

@ -1,6 +1,6 @@
package nu.marginalia.converting.model; package nu.marginalia.model.html;
// This class really doesn't belong anywhere, but will squat here for now
public enum HtmlStandard { public enum HtmlStandard {
PLAIN(0, 1), PLAIN(0, 1),
UNKNOWN(0, 1), UNKNOWN(0, 1),

View File

@ -15,7 +15,6 @@ java {
dependencies { dependencies {
implementation project(':code:common:model') implementation project(':code:common:model')
implementation project(':code:process-models:converting-model')
implementation libs.lombok implementation libs.lombok
annotationProcessor libs.lombok annotationProcessor libs.lombok

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate; package nu.marginalia.pubdate;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
public class PubDateFromHtmlStandard { public class PubDateFromHtmlStandard {
/** Used to bias pub date heuristics */ /** Used to bias pub date heuristics */

View File

@ -2,7 +2,7 @@ package nu.marginalia.pubdate;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.util.Optional; import java.util.Optional;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate; package nu.marginalia.pubdate;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import java.time.DateTimeException; import java.time.DateTimeException;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate; package nu.marginalia.pubdate;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.heuristic.*; import nu.marginalia.pubdate.heuristic.*;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateFromHtmlStandard; import nu.marginalia.pubdate.PubDateFromHtmlStandard;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;

View File

@ -5,7 +5,7 @@ import com.google.gson.GsonBuilder;
import com.google.gson.JsonSyntaxException; import com.google.gson.JsonSyntaxException;
import com.google.gson.annotations.SerializedName; import com.google.gson.annotations.SerializedName;
import lombok.ToString; import lombok.ToString;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic; package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.pubdate.PubDateParser;

View File

@ -2,7 +2,7 @@ package nu.marginalia.pubdate;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.pubdate.heuristic.PubDateHeuristicDOMParsingPass2; import nu.marginalia.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;

View File

@ -1,47 +0,0 @@
plugins {
id 'java'
id "io.freefair.lombok" version "8.2.2"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(20))
}
}
dependencies {
//implementation project(':third-party:monkey-patch-gson')
implementation project(':code:common:db')
implementation project(':code:common:model')
implementation project(':code:api:index-api')
implementation project(':code:common:service-discovery')
implementation project(':code:common:service-client')
implementation project(':code:libraries:language-processing')
implementation project(':code:features-convert:keyword-extraction')
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.trove
implementation libs.fastutil
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
useJUnitPlatform()
}
task fastTests(type: Test) {
useJUnitPlatform {
excludeTags "slow"
}
}

View File

@ -1,3 +0,0 @@
# Converting Models
!!To be deleted!!

View File

@ -43,7 +43,6 @@ dependencies {
implementation project(':code:libraries:big-string') implementation project(':code:libraries:big-string')
implementation project(':code:libraries:language-processing') implementation project(':code:libraries:language-processing')
implementation project(':code:process-models:converting-model')
implementation project(':code:process-models:processed-data') implementation project(':code:process-models:processed-data')
implementation project(':code:process-models:work-log') implementation project(':code:process-models:work-log')
implementation project(':code:process-models:crawling-model') implementation project(':code:process-models:crawling-model')

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.model;
import lombok.ToString; import lombok.ToString;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;

View File

@ -2,14 +2,13 @@ package nu.marginalia.converting.processor.logic;
import crawlercommons.utils.Strings; import crawlercommons.utils.Strings;
import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor; import org.jsoup.select.NodeVisitor;
import java.util.Set; import java.util.Set;

View File

@ -1,7 +1,7 @@
package nu.marginalia.converting.processor.logic; package nu.marginalia.converting.processor.logic;
import com.google.common.base.Strings; import com.google.common.base.Strings;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType; import org.jsoup.nodes.DocumentType;
import org.slf4j.Logger; import org.slf4j.Logger;

View File

@ -4,7 +4,6 @@ import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.nio.file.Path;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;

View File

@ -1,10 +1,9 @@
package nu.marginalia.converting.processor.plugin; package nu.marginalia.converting.processor.plugin;
import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.converting.language.LanguageFilter; import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.DisqualifiedException;

View File

@ -13,10 +13,9 @@ import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.link_parser.LinkParser; import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.DocumentMetadata;

View File

@ -5,10 +5,9 @@ import com.google.inject.name.Named;
import nu.marginalia.converting.language.LanguageFilter; import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.converting.processor.logic.DocumentLengthLogic; import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.DocumentMetadata;

View File

@ -10,6 +10,7 @@ import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordFlags;

View File

@ -3,7 +3,7 @@ package nu.marginalia.converting;
import com.google.inject.Guice; import com.google.inject.Guice;
import com.google.inject.Injector; import com.google.inject.Injector;
import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.io.SerializableCrawlDataStream;

View File

@ -34,7 +34,6 @@ dependencies {
implementation project(':code:libraries:language-processing') implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:easy-lsh') implementation project(':code:libraries:easy-lsh')
implementation project(':code:process-models:crawling-model') implementation project(':code:process-models:crawling-model')
implementation project(':code:process-models:converting-model')
implementation project(':code:features-crawl:crawl-blocklist') implementation project(':code:features-crawl:crawl-blocklist')
implementation project(':code:features-crawl:link-parser') implementation project(':code:features-crawl:link-parser')

View File

@ -35,7 +35,6 @@ dependencies {
testImplementation project(':code:services-core:search-service') testImplementation project(':code:services-core:search-service')
implementation project(':code:process-models:crawling-model') implementation project(':code:process-models:crawling-model')
implementation project(':code:process-models:converting-model')
implementation project(':code:process-models:processed-data') implementation project(':code:process-models:processed-data')
implementation project(':code:process-models:work-log') implementation project(':code:process-models:work-log')

View File

@ -11,13 +11,17 @@ based on the content in the database.
## 2. Converting Process ## 2. Converting Process
The [converting-process](converting-process/) reads crawl data from the crawling step and The [converting-process](converting-process/) reads crawl data from the crawling step and
processes them, extracting keywords and metadata and saves them as compressed JSON models processes them, extracting keywords and metadata and saves them as parquet files
described in [converting-model](../process-models/converting-model/). described in [processed-data](../process-models/processed-data/).
## 3. Loading Process ## 3. Loading Process
The [loading-process](loading-process/) reads the processed data and creates an index journal The [loading-process](loading-process/) reads the processed data.
and lexicon, and loads domains and addresses into the MariaDB-database.
It creates an [index journal](../features-index/index-journal),
a [link database](../common/linkdb),
and loads domains and domain-links
into the [MariaDB database](../common/db).
## 4. Index Construction Process ## 4. Index Construction Process
@ -56,21 +60,14 @@ Schematically the crawling and loading process looks like this:
+------------+ features, links, URLs +------------+ features, links, URLs
| |
//==================\\ //==================\\
|| Compressed JSON: || Processed || Parquet: || Processed
|| URLs[] || Files || Documents[] || Files
|| Domains[] || || Domains[] ||
|| Links[] || || Links[] ||
|| Keywords[] ||
|| ... ||
|| URLs[] ||
|| Domains[] ||
|| Links[] ||
|| Keywords[] ||
|| ... ||
\\==================// \\==================//
| |
+------------+ +------------+ Insert domains into mariadb
| LOADING | Insert URLs in link DB | LOADING | Insert URLs, titles in link DB
| STEP | Insert keywords in Index | STEP | Insert keywords in Index
+------------+ +------------+
| |

View File

@ -31,7 +31,6 @@ dependencies {
implementation project(':code:libraries:term-frequency-dict') implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:big-string') implementation project(':code:libraries:big-string')
implementation project(':code:processes:converting-process') implementation project(':code:processes:converting-process')
implementation project(':code:process-models:converting-model')
implementation project(':code:process-models:crawling-model') implementation project(':code:process-models:crawling-model')
implementation project(':code:features-convert:adblock') implementation project(':code:features-convert:adblock')

View File

@ -63,7 +63,6 @@ include 'code:processes:loading-process'
include 'code:processes:index-constructor-process' include 'code:processes:index-constructor-process'
include 'code:processes:test-data' include 'code:processes:test-data'
include 'code:process-models:converting-model'
include 'code:process-models:crawling-model' include 'code:process-models:crawling-model'
include 'code:process-models:work-log' include 'code:process-models:work-log'
include 'code:process-models:processed-data' include 'code:process-models:processed-data'