From eaeb23d41e51e2ce6cc5a836cbb734c47e7267a6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 14 Sep 2023 11:21:44 +0200 Subject: [PATCH] (refactor) Remove converting-model package completely --- .../marginalia/model/html}/HtmlStandard.java | 4 +- code/features-convert/pubdate/build.gradle | 1 - .../pubdate/PubDateFromHtmlStandard.java | 2 +- .../marginalia/pubdate/PubDateHeuristic.java | 2 +- .../nu/marginalia/pubdate/PubDateParser.java | 2 +- .../nu/marginalia/pubdate/PubDateSniffer.java | 2 +- .../PubDateHeuristicDOMParsingPass1.java | 2 +- .../PubDateHeuristicDOMParsingPass2.java | 2 +- ...PubDateHeuristicGuessFromHtmlStandard.java | 2 +- .../PubDateHeuristicHtml5AnyTimeTag.java | 2 +- .../PubDateHeuristicHtml5ArticleDateTag.java | 2 +- .../PubDateHeuristicHtml5ItempropDateTag.java | 2 +- .../heuristic/PubDateHeuristicJSONLD.java | 2 +- .../PubDateHeuristicLastModified.java | 2 +- .../heuristic/PubDateHeuristicMicrodata.java | 2 +- .../heuristic/PubDateHeuristicOpenGraph.java | 2 +- .../heuristic/PubDateHeuristicRDFaTag.java | 2 +- .../PubDateHeuristicUrlPatternPass1.java | 2 +- .../PubDateHeuristicUrlPatternPass2.java | 2 +- .../pubdate/PubDateSnifferTest.java | 2 +- .../converting-model/build.gradle | 47 ------------------- .../process-models/converting-model/readme.md | 3 -- .../processes/converting-process/build.gradle | 1 - .../model/ProcessedDocumentDetails.java | 1 + .../processor/logic/DocumentValuator.java | 3 +- .../logic/HtmlStandardExtractor.java | 2 +- .../processor/logic/links/FileLinks.java | 1 - .../AbstractDocumentProcessorPlugin.java | 3 +- .../plugin/HtmlDocumentProcessorPlugin.java | 3 +- .../PlainTextDocumentProcessorPlugin.java | 3 +- .../sideload/StackexchangeSideloader.java | 1 + .../converting/ConvertingIntegrationTest.java | 2 +- code/processes/crawling-process/build.gradle | 1 - code/processes/loading-process/build.gradle | 1 - code/processes/readme.md | 27 +++++------ code/tools/experiment-runner/build.gradle | 1 - 
settings.gradle | 1 - 37 files changed, 40 insertions(+), 102 deletions(-) rename code/{process-models/converting-model/src/main/java/nu/marginalia/converting/model => common/model/src/main/java/nu/marginalia/model/html}/HtmlStandard.java (78%) delete mode 100644 code/process-models/converting-model/build.gradle delete mode 100644 code/process-models/converting-model/readme.md diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/HtmlStandard.java b/code/common/model/src/main/java/nu/marginalia/model/html/HtmlStandard.java similarity index 78% rename from code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/HtmlStandard.java rename to code/common/model/src/main/java/nu/marginalia/model/html/HtmlStandard.java index ecb3d630..cdd23742 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/HtmlStandard.java +++ b/code/common/model/src/main/java/nu/marginalia/model/html/HtmlStandard.java @@ -1,6 +1,6 @@ -package nu.marginalia.converting.model; - +package nu.marginalia.model.html; +// This class really doesn't belong anywhere, but will squat here for now public enum HtmlStandard { PLAIN(0, 1), UNKNOWN(0, 1), diff --git a/code/features-convert/pubdate/build.gradle b/code/features-convert/pubdate/build.gradle index 1535b203..ee256ebf 100644 --- a/code/features-convert/pubdate/build.gradle +++ b/code/features-convert/pubdate/build.gradle @@ -15,7 +15,6 @@ java { dependencies { implementation project(':code:common:model') - implementation project(':code:process-models:converting-model') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java index d7777e0e..dfbab8d3 100644 --- 
a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; public class PubDateFromHtmlStandard { /** Used to bias pub date heuristics */ diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateHeuristic.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateHeuristic.java index ddc3b9c4..56355806 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateHeuristic.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateHeuristic.java @@ -2,7 +2,7 @@ package nu.marginalia.pubdate; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java index 1abd84dd..1fbade80 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import java.time.DateTimeException; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java index b8b9b704..90b25915 100644 --- 
a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.EdgeUrl; import nu.marginalia.pubdate.heuristic.*; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java index 5f8c7ffc..28059f64 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java index 2bcf5dab..bb625180 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import 
nu.marginalia.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateFromHtmlStandard; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java index c082f555..30486f2f 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateHeuristic; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java index ac8dbf01..30513a47 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java index 
0bcb28dd..45c8b091 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java index ac88fcb4..aa09d392 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java index d3173b81..3ddf58eb 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java @@ -5,7 +5,7 @@ import com.google.gson.GsonBuilder; import com.google.gson.JsonSyntaxException; import 
com.google.gson.annotations.SerializedName; import lombok.ToString; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicLastModified.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicLastModified.java index 69a780b9..ca42d469 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicLastModified.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicLastModified.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicMicrodata.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicMicrodata.java index 62a16f5a..584375f2 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicMicrodata.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicMicrodata.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicOpenGraph.java 
b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicOpenGraph.java index 023e954c..74a7a654 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicOpenGraph.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicOpenGraph.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateHeuristic; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicRDFaTag.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicRDFaTag.java index 18bd7e80..1ed20019 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicRDFaTag.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicRDFaTag.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateHeuristic; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java index 65b1d4da..6a6d5630 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java @@ -1,6 +1,6 @@ package 
nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java index e05a10ef..ea3ab9d9 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java b/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java index 1794c196..efd320e8 100644 --- a/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java +++ b/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java @@ -2,7 +2,7 @@ package nu.marginalia.pubdate; import nu.marginalia.WmsaHome; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.pubdate.heuristic.PubDateHeuristicDOMParsingPass2; import org.jsoup.Jsoup; import org.junit.jupiter.api.Test; diff --git a/code/process-models/converting-model/build.gradle b/code/process-models/converting-model/build.gradle deleted file mode 100644 index cb25b932..00000000 --- 
a/code/process-models/converting-model/build.gradle +++ /dev/null @@ -1,47 +0,0 @@ -plugins { - id 'java' - id "io.freefair.lombok" version "8.2.2" - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(20)) - } -} -dependencies { - - //implementation project(':third-party:monkey-patch-gson') - - implementation project(':code:common:db') - implementation project(':code:common:model') - implementation project(':code:api:index-api') - implementation project(':code:common:service-discovery') - implementation project(':code:common:service-client') - implementation project(':code:libraries:language-processing') - - implementation project(':code:features-convert:keyword-extraction') - - implementation libs.lombok - annotationProcessor libs.lombok - implementation libs.bundles.slf4j - - implementation libs.notnull - implementation libs.trove - implementation libs.fastutil - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito -} - -test { - useJUnitPlatform() -} - -task fastTests(type: Test) { - useJUnitPlatform { - excludeTags "slow" - } -} diff --git a/code/process-models/converting-model/readme.md b/code/process-models/converting-model/readme.md deleted file mode 100644 index 52973e48..00000000 --- a/code/process-models/converting-model/readme.md +++ /dev/null @@ -1,3 +0,0 @@ -# Converting Models - -!!To be deleted!! 
\ No newline at end of file diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index cb8e80e1..fdc37e75 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -43,7 +43,6 @@ dependencies { implementation project(':code:libraries:big-string') implementation project(':code:libraries:language-processing') - implementation project(':code:process-models:converting-model') implementation project(':code:process-models:processed-data') implementation project(':code:process-models:work-log') implementation project(':code:process-models:crawling-model') diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java index 97cb964b..ee70fb14 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java @@ -2,6 +2,7 @@ package nu.marginalia.converting.model; import lombok.ToString; import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.EdgeUrl; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index baacb766..218f16b8 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -2,14 +2,13 @@ package nu.marginalia.converting.processor.logic; import 
crawlercommons.utils.Strings; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.model.crawl.HtmlFeature; import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; import org.jsoup.select.NodeVisitor; import java.util.Set; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java index 52537f68..f0f994da 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java @@ -1,7 +1,7 @@ package nu.marginalia.converting.processor.logic; import com.google.common.base.Strings; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import org.jsoup.nodes.DocumentType; import org.slf4j.Logger; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/FileLinks.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/FileLinks.java index e8809b67..10c31606 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/FileLinks.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/FileLinks.java @@ -4,7 +4,6 @@ import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import org.jsoup.nodes.Document; -import java.nio.file.Path; import 
java.util.HashSet; import java.util.Set; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java index 2d2f58ca..913ba81d 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java @@ -1,10 +1,9 @@ package nu.marginalia.converting.processor.plugin; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.converting.language.LanguageFilter; import nu.marginalia.language.model.DocumentLanguageData; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.converting.model.DisqualifiedException; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 7d48bf3b..c51e9690 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -13,10 +13,9 @@ import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; import 
nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.idx.DocumentMetadata; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java index bc288430..797b3b6d 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java @@ -5,10 +5,9 @@ import com.google.inject.name.Named; import nu.marginalia.converting.language.LanguageFilter; import nu.marginalia.converting.processor.logic.DocumentLengthLogic; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.idx.DocumentMetadata; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java index 0f40639b..07ad6391 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java +++ 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java @@ -10,6 +10,7 @@ import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.UrlIndexingState; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordFlags; diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index d43ddecf..5c6ebe81 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -3,7 +3,7 @@ package nu.marginalia.converting; import com.google.inject.Guice; import com.google.inject.Injector; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.crawling.io.SerializableCrawlDataStream; diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index 5dfe0556..27b7bf32 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -34,7 +34,6 @@ dependencies { implementation project(':code:libraries:language-processing') implementation project(':code:libraries:easy-lsh') implementation project(':code:process-models:crawling-model') - implementation project(':code:process-models:converting-model') implementation project(':code:features-crawl:crawl-blocklist') implementation 
project(':code:features-crawl:link-parser') diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index 736fec8d..a890ec23 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -35,7 +35,6 @@ dependencies { testImplementation project(':code:services-core:search-service') implementation project(':code:process-models:crawling-model') - implementation project(':code:process-models:converting-model') implementation project(':code:process-models:processed-data') implementation project(':code:process-models:work-log') diff --git a/code/processes/readme.md b/code/processes/readme.md index 6b1ccede..b2839a09 100644 --- a/code/processes/readme.md +++ b/code/processes/readme.md @@ -11,13 +11,17 @@ based on the content in the database. ## 2. Converting Process The [converting-process](converting-process/) reads crawl data from the crawling step and -processes them, extracting keywords and metadata and saves them as compressed JSON models -described in [converting-model](../process-models/converting-model/). +processes them, extracting keywords and metadata and saves them as parquet files +described in [processed-data](../process-models/processed-data/). ## 3. Loading Process -The [loading-process](loading-process/) reads the processed data and creates an index journal -and lexicon, and loads domains and addresses into the MariaDB-database. +The [loading-process](loading-process/) reads the processed data. + +It creates an [index journal](../features-index/index-journal), +a [link database](../common/linkdb), +and loads domains and domain-links +into the [MariaDB database](../common/db). ## 4. 
Index Construction Process @@ -56,21 +60,14 @@ Schematically the crawling and loading process looks like this: +------------+ features, links, URLs | //==================\\ - || Compressed JSON: || Processed - || URLs[] || Files + || Parquet: || Processed + || Documents[] || Files || Domains[] || || Links[] || - || Keywords[] || - || ... || - || URLs[] || - || Domains[] || - || Links[] || - || Keywords[] || - || ... || \\==================// | - +------------+ - | LOADING | Insert URLs in link DB + +------------+ Insert domains into mariadb + | LOADING | Insert URLs, titles in link DB | STEP | Insert keywords in Index +------------+ | diff --git a/code/tools/experiment-runner/build.gradle b/code/tools/experiment-runner/build.gradle index 77d84e21..3eb4c244 100644 --- a/code/tools/experiment-runner/build.gradle +++ b/code/tools/experiment-runner/build.gradle @@ -31,7 +31,6 @@ dependencies { implementation project(':code:libraries:term-frequency-dict') implementation project(':code:libraries:big-string') implementation project(':code:processes:converting-process') - implementation project(':code:process-models:converting-model') implementation project(':code:process-models:crawling-model') implementation project(':code:features-convert:adblock') diff --git a/settings.gradle b/settings.gradle index af44349d..dc42ead8 100644 --- a/settings.gradle +++ b/settings.gradle @@ -63,7 +63,6 @@ include 'code:processes:loading-process' include 'code:processes:index-constructor-process' include 'code:processes:test-data' -include 'code:process-models:converting-model' include 'code:process-models:crawling-model' include 'code:process-models:work-log' include 'code:process-models:processed-data'