mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(refactor) Remove converting-model package completely
This commit is contained in:
parent
c71f6ad417
commit
eaeb23d41e
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.converting.model;
|
||||
|
||||
package nu.marginalia.model.html;
|
||||
|
||||
// This class really doesn't belong anywhere, but will squat here for now
|
||||
public enum HtmlStandard {
|
||||
PLAIN(0, 1),
|
||||
UNKNOWN(0, 1),
|
@ -15,7 +15,6 @@ java {
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:process-models:converting-model')
|
||||
|
||||
implementation libs.lombok
|
||||
annotationProcessor libs.lombok
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
|
||||
public class PubDateFromHtmlStandard {
|
||||
/** Used to bias pub date heuristics */
|
||||
|
@ -2,7 +2,7 @@ package nu.marginalia.pubdate;
|
||||
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
import java.util.Optional;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
|
||||
import java.time.DateTimeException;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.pubdate.heuristic.*;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate.heuristic;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||
import nu.marginalia.pubdate.PubDateParser;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate.heuristic;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateEffortLevel;
|
||||
import nu.marginalia.pubdate.PubDateFromHtmlStandard;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate.heuristic;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateEffortLevel;
|
||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate.heuristic;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||
import nu.marginalia.pubdate.PubDateParser;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate.heuristic;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||
import nu.marginalia.pubdate.PubDateParser;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate.heuristic;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||
import nu.marginalia.pubdate.PubDateParser;
|
||||
|
@ -5,7 +5,7 @@ import com.google.gson.GsonBuilder;
|
||||
import com.google.gson.JsonSyntaxException;
|
||||
import com.google.gson.annotations.SerializedName;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||
import nu.marginalia.pubdate.PubDateParser;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate.heuristic;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||
import nu.marginalia.pubdate.PubDateParser;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate.heuristic;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate.heuristic;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateEffortLevel;
|
||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate.heuristic;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateEffortLevel;
|
||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate.heuristic;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||
import nu.marginalia.pubdate.PubDateParser;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.pubdate.heuristic;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||
import nu.marginalia.pubdate.PubDateParser;
|
||||
|
@ -2,7 +2,7 @@ package nu.marginalia.pubdate;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
@ -1,47 +0,0 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
id "io.freefair.lombok" version "8.2.2"
|
||||
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(20))
|
||||
}
|
||||
}
|
||||
dependencies {
|
||||
|
||||
//implementation project(':third-party:monkey-patch-gson')
|
||||
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:api:index-api')
|
||||
implementation project(':code:common:service-discovery')
|
||||
implementation project(':code:common:service-client')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
|
||||
implementation project(':code:features-convert:keyword-extraction')
|
||||
|
||||
implementation libs.lombok
|
||||
annotationProcessor libs.lombok
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation libs.notnull
|
||||
implementation libs.trove
|
||||
implementation libs.fastutil
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
}
|
||||
|
||||
test {
|
||||
useJUnitPlatform()
|
||||
}
|
||||
|
||||
task fastTests(type: Test) {
|
||||
useJUnitPlatform {
|
||||
excludeTags "slow"
|
||||
}
|
||||
}
|
@ -1,3 +0,0 @@
|
||||
# Converting Models
|
||||
|
||||
!!To be deleted!!
|
@ -43,7 +43,6 @@ dependencies {
|
||||
implementation project(':code:libraries:big-string')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
|
||||
implementation project(':code:process-models:converting-model')
|
||||
implementation project(':code:process-models:processed-data')
|
||||
implementation project(':code:process-models:work-log')
|
||||
implementation project(':code:process-models:crawling-model')
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.converting.model;
|
||||
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
|
@ -2,14 +2,13 @@ package nu.marginalia.converting.processor.logic;
|
||||
|
||||
import crawlercommons.utils.Strings;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.nodes.Node;
|
||||
import org.jsoup.nodes.TextNode;
|
||||
import org.jsoup.select.NodeVisitor;
|
||||
|
||||
import java.util.Set;
|
||||
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.converting.processor.logic;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.DocumentType;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -4,7 +4,6 @@ import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -1,10 +1,9 @@
|
||||
package nu.marginalia.converting.processor.plugin;
|
||||
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.converting.language.LanguageFilter;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.converting.model.DisqualifiedException;
|
||||
|
@ -13,10 +13,9 @@ import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.idx.DocumentFlags;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
|
@ -5,10 +5,9 @@ import com.google.inject.name.Named;
|
||||
import nu.marginalia.converting.language.LanguageFilter;
|
||||
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.idx.DocumentFlags;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
|
@ -10,6 +10,7 @@ import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.idx.DocumentFlags;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
|
@ -3,7 +3,7 @@ package nu.marginalia.converting;
|
||||
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
|
||||
|
@ -34,7 +34,6 @@ dependencies {
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
implementation project(':code:process-models:crawling-model')
|
||||
implementation project(':code:process-models:converting-model')
|
||||
|
||||
implementation project(':code:features-crawl:crawl-blocklist')
|
||||
implementation project(':code:features-crawl:link-parser')
|
||||
|
@ -35,7 +35,6 @@ dependencies {
|
||||
testImplementation project(':code:services-core:search-service')
|
||||
|
||||
implementation project(':code:process-models:crawling-model')
|
||||
implementation project(':code:process-models:converting-model')
|
||||
implementation project(':code:process-models:processed-data')
|
||||
implementation project(':code:process-models:work-log')
|
||||
|
||||
|
@ -11,13 +11,17 @@ based on the content in the database.
|
||||
## 2. Converting Process
|
||||
|
||||
The [converting-process](converting-process/) reads crawl data from the crawling step and
|
||||
processes them, extracting keywords and metadata and saves them as compressed JSON models
|
||||
described in [converting-model](../process-models/converting-model/).
|
||||
processes them, extracting keywords and metadata and saves them as parquet files
|
||||
described in [processed-data](../process-models/processed-data/).
|
||||
|
||||
## 3. Loading Process
|
||||
|
||||
The [loading-process](loading-process/) reads the processed data and creates an index journal
|
||||
and lexicon, and loads domains and addresses into the MariaDB-database.
|
||||
The [loading-process](loading-process/) reads the processed data.
|
||||
|
||||
It creates an [index journal](../features-index/index-journal),
|
||||
a [link database](../common/linkdb),
|
||||
and loads domains and domain-links
|
||||
into the [MariaDB database](../common/db).
|
||||
|
||||
## 4. Index Construction Process
|
||||
|
||||
@ -56,21 +60,14 @@ Schematically the crawling and loading process looks like this:
|
||||
+------------+ features, links, URLs
|
||||
|
|
||||
//==================\\
|
||||
|| Compressed JSON: || Processed
|
||||
|| URLs[] || Files
|
||||
|| Parquet: || Processed
|
||||
|| Documents[] || Files
|
||||
|| Domains[] ||
|
||||
|| Links[] ||
|
||||
|| Keywords[] ||
|
||||
|| ... ||
|
||||
|| URLs[] ||
|
||||
|| Domains[] ||
|
||||
|| Links[] ||
|
||||
|| Keywords[] ||
|
||||
|| ... ||
|
||||
\\==================//
|
||||
|
|
||||
+------------+
|
||||
| LOADING | Insert URLs in link DB
|
||||
+------------+ Insert domains into mariadb
|
||||
| LOADING | Insert URLs, titles in link DB
|
||||
| STEP | Insert keywords in Index
|
||||
+------------+
|
||||
|
|
||||
|
@ -31,7 +31,6 @@ dependencies {
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
implementation project(':code:libraries:big-string')
|
||||
implementation project(':code:processes:converting-process')
|
||||
implementation project(':code:process-models:converting-model')
|
||||
implementation project(':code:process-models:crawling-model')
|
||||
|
||||
implementation project(':code:features-convert:adblock')
|
||||
|
@ -63,7 +63,6 @@ include 'code:processes:loading-process'
|
||||
include 'code:processes:index-constructor-process'
|
||||
include 'code:processes:test-data'
|
||||
|
||||
include 'code:process-models:converting-model'
|
||||
include 'code:process-models:crawling-model'
|
||||
include 'code:process-models:work-log'
|
||||
include 'code:process-models:processed-data'
|
||||
|
Loading…
Reference in New Issue
Block a user