From 6d939175b104d675fa1a49c49c48375701303937 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 11 Mar 2023 13:48:40 +0100 Subject: [PATCH] Additional code restructuring to get rid of util and misc-style packages. --- .../results/SearchResultKeywordScore.java | 26 +++++----- code/common/config/build.gradle | 1 - code/common/model/build.gradle | 1 + ...ingState.java => DomainIndexingState.java} | 4 +- .../model/crawl/EdgeContentType.java | 15 ------ ...dgeUrlState.java => UrlIndexingState.java} | 2 +- ...ainBlacklist.java => DomainBlacklist.java} | 4 +- ...listImpl.java => DomainBlacklistImpl.java} | 5 +- .../DocumentFlags.java} | 10 ++-- .../model/idx/DocumentMetadata.java | 7 ++- .../WordFlags.java} | 21 +++----- .../nu/marginalia/model/idx/WordMetadata.java | 14 +++--- .../java/nu/marginalia/util/QueryParams.java | 1 - .../model/DocumentMetadataTest.java | 6 +-- .../nu/marginalia/model/WordMetadataTest.java | 16 +++--- code/common/service/build.gradle | 1 - .../crawling/common/model/HtmlStandard.java} | 6 +-- code/crawl/converting-model/build.gradle | 2 + .../converting/instruction/Interpreter.java | 4 +- .../instructions/LoadProcessedDocument.java | 8 +-- .../LoadProcessedDocumentWithError.java | 4 +- .../instructions/LoadProcessedDomain.java | 4 +- .../model/DocumentKeywordsBuilder.java | 6 +-- code/crawl/converting-process/build.gradle | 13 +++-- .../marginalia/converting/ConversionLog.java | 4 +- .../converting/InstructionWriter.java | 4 +- .../converting/model/ProcessedDocument.java | 10 ++-- .../model/ProcessedDocumentDetails.java | 4 +- .../converting/model/ProcessedDomain.java | 4 +- .../processor/DocumentProcessor.java | 14 +++--- .../converting/processor/DomainProcessor.java | 13 +++-- .../converting/processor/SiteWords.java | 12 ++--- .../keywords/extractors/SimpleKeywords.java | 4 +- .../processor/logic/DocumentValuator.java | 4 +- .../processor/logic/FeatureExtractor.java | 6 ++- .../logic/HtmlStandardExtractor.java | 48 +++++++++--------- 
.../logic/LshDocumentDeduplicator.java | 4 +- .../logic/links/CommonKeywordExtractor.java | 6 +-- .../logic/links/InternalLinkGraph.java | 4 +- .../logic/pubdate/PubDateEffortLevel.java | 6 --- ...PubDateHeuristicGuessFromHtmlStandard.java | 23 --------- .../AbstractDocumentProcessorPlugin.java | 4 +- .../plugin/HtmlDocumentProcessorPlugin.java | 15 +++--- .../PlainTextDocumentProcessorPlugin.java | 11 ++-- .../converting}/util/LineUtils.java | 2 +- .../converting/logic/PlainTextLogicTest.java | 2 +- .../keywords/SentenceExtractorTest.java | 5 -- .../converting}/util/LineUtilsTest.java | 2 +- .../crawl/CrawlJobDomainExtractor.java | 7 ++- .../crawl/CrawlJobExtractorMain.java | 4 +- .../crawling/model/ContentType.java | 5 ++ .../crawl/retreival/HttpFetcher.java | 10 ++-- .../retreival/logic/ContentTypeParser.java | 12 ++--- code/crawl/experimental/build.gradle | 2 + .../experimental/AdblockTesterTool.java | 2 +- .../experimental/ConverterLogicTestTool.java | 8 +-- .../experimental/CrawlDataExtractorTool.java | 2 +- code/crawl/loading-process/build.gradle | 1 - .../nu/marginalia/loading/loader/Loader.java | 4 +- .../loader/SqlLoadProcessedDomain.java | 4 +- .../loader/SqlLoadProcessedDocumentTest.java | 8 +-- .../loader/SqlLoadProcessedDomainTest.java | 4 +- code/features/adblock/build.gradle | 41 +++++++++++++++ code/features/adblock/readme.md | 8 +++ .../marginalia/adblock}/AdblockSimulator.java | 2 +- .../adblock}/GoogleAnwersSpamDetector.java | 2 +- .../ranking/data/RankingDomainData.java | 8 +-- .../ranking/data/RankingDomainFetcher.java | 10 ++-- ...RankingDomainFetcherForSimilarityData.java | 4 +- .../tool/CreateBrowseDomainRanksTool.java | 4 +- .../ranking/tool/PerusePageRankV2.java | 4 +- .../ranking/tool/PrintDomainRanksTool.java | 6 +-- .../ranking/tool/UpdateDomainRanksTool.java | 4 +- code/features/pubdate/build.gradle | 44 ++++++++++++++++ code/features/pubdate/readme.md | 7 +++ .../pubdate/PubDateEffortLevel.java | 6 +++ 
.../marginalia}/pubdate/PubDateHeuristic.java | 6 +-- .../nu/marginalia}/pubdate/PubDateParser.java | 6 +-- .../marginalia}/pubdate/PubDateSniffer.java | 8 +-- .../PubDateHeuristicDOMParsingPass1.java | 18 +++---- .../PubDateHeuristicDOMParsingPass2.java | 18 +++---- ...PubDateHeuristicGuessFromHtmlStandard.java | 23 +++++++++ .../PubDateHeuristicHtml5AnyTimeTag.java | 12 ++--- .../PubDateHeuristicHtml5ArticleDateTag.java | 12 ++--- .../PubDateHeuristicHtml5ItempropDateTag.java | 12 ++--- .../heuristic/PubDateHeuristicJSONLD.java | 12 ++--- .../PubDateHeuristicLastModified.java | 12 ++--- .../heuristic/PubDateHeuristicMicrodata.java | 14 +++--- .../heuristic/PubDateHeuristicOpenGraph.java | 12 ++--- .../heuristic/PubDateHeuristicRDFaTag.java | 12 ++--- .../PubDateHeuristicUrlPatternPass1.java | 12 ++--- .../PubDateHeuristicUrlPatternPass2.java | 12 ++--- .../pubdate}/PubDateSnifferTest.java | 32 ++++++------ code/features/query-parser/build.gradle | 1 - .../marginalia/query_parser/QueryParser.java | 2 +- .../transform_list}/TransformList.java | 2 +- .../transform_list}/TransformListTest.java | 2 +- .../browse/DbBrowseDomainsRandom.java | 4 +- .../browse/DbBrowseDomainsSimilarCosine.java | 4 +- .../browse/DbBrowseDomainsSimilarOldAlgo.java | 6 +-- code/features/readme.md | 3 ++ code/features/topic-detection/build.gradle | 43 ++++++++++++++++ code/features/topic-detection/readme.md | 4 ++ .../nu/marginalia}/topic/RecipeDetector.java | 2 +- .../topic/TextileCraftDetector.java | 2 +- .../topic/WoodworkingDetector.java | 2 +- code/index/index-forward/build.gradle | 1 - .../ReverseIndexPriorityParameters.java | 18 +++---- code/index/lexicon/build.gradle | 2 +- .../dict/OffHeapDictionaryHashMap.java | 4 +- .../build.gradle | 3 -- .../braille-block-punch-cards/readme.md | 9 ++++ .../bbpc}/BrailleBlockPunchCards.java | 2 +- .../bbpc}/BrailleBlockPunchCardsTest.java | 2 +- code/libraries/btree/build.gradle | 2 +- .../BTreeReaderQueryDataWithIndexTest.java | 4 -- 
.../BTreeReaderRejectRetainWithIndexTest.java | 4 +- ...reeReaderRejectRetainWithoutIndexTest.java | 4 +- .../language/model/KeywordMetadata.java | 26 +++++----- .../language/statistics}/DenseBitMap.java | 2 +- .../language/statistics/NGramBloomFilter.java | 1 - .../language/statistics}/DenseBitMapTest.java | 6 +-- .../java/nu/marginalia/test/TestUtil.java | 50 ------------------- code/libraries/next-prime/build.gradle | 27 ++++++++++ code/libraries/next-prime/readme.md | 4 ++ .../nu/marginalia/util/NextPrimeUtil.java} | 3 +- .../nu/marginalia/util/NextPrimeUtilTest.java | 29 +++++++++++ code/libraries/readme.md | 6 +-- code/services-core/index-service/build.gradle | 1 - .../index/results/IndexResultValuator.java | 18 +++---- .../index/service/util/PrimeUtilTest.java | 30 ----------- .../svc/IndexQueryServiceIntegrationTest.java | 6 +-- .../services-core/search-service/build.gradle | 2 +- .../command/commands/BrowseCommand.java | 6 +-- .../command/commands/SearchCommand.java | 8 ++- .../search/db/DbUrlDetailsQuery.java | 4 +- .../marginalia/search/model/UrlDetails.java | 6 +-- .../search/results/SearchResultDecorator.java | 6 +-- .../siteinfo/DomainInformationService.java | 10 ++-- .../valuation/SearchResultValuator.java | 34 ++++++------- .../search/search-result-metadata.hdb | 7 ++- .../valuation/SearchResultValuatorTest.java | 24 ++++----- .../nu/marginalia/dating/DatingService.java | 6 +-- .../dating/DatingSessionObject.java | 6 +-- other/memex/build.gradle | 1 - .../java/nu/marginalia/util/FileSizeUtil.java | 0 settings.gradle | 6 ++- 147 files changed, 715 insertions(+), 600 deletions(-) rename code/common/model/src/main/java/nu/marginalia/model/crawl/{EdgeDomainIndexingState.java => DomainIndexingState.java} (81%) delete mode 100644 code/common/model/src/main/java/nu/marginalia/model/crawl/EdgeContentType.java rename code/common/model/src/main/java/nu/marginalia/model/crawl/{EdgeUrlState.java => UrlIndexingState.java} (81%) rename 
code/common/model/src/main/java/nu/marginalia/model/dbcommon/{EdgeDomainBlacklist.java => DomainBlacklist.java} (83%) rename code/common/model/src/main/java/nu/marginalia/model/dbcommon/{EdgeDomainBlacklistImpl.java => DomainBlacklistImpl.java} (91%) rename code/common/model/src/main/java/nu/marginalia/model/{crawl/EdgePageDocumentFlags.java => idx/DocumentFlags.java} (65%) rename code/common/model/src/main/java/nu/marginalia/model/{crawl/EdgePageWordFlags.java => idx/WordFlags.java} (63%) rename code/{services-core/index-service/src/test/java/nu/marginalia/index => common/model/src/test/java/nu/marginalia}/model/DocumentMetadataTest.java (95%) rename code/{common/model/src/main/java/nu/marginalia/model/crawl/EdgeHtmlStandard.java => crawl/common/src/main/java/nu/marginalia/crawling/common/model/HtmlStandard.java} (79%) delete mode 100644 code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateEffortLevel.java delete mode 100644 code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java rename code/{common/model/src/main/java/nu/marginalia => crawl/converting-process/src/main/java/nu/marginalia/converting}/util/LineUtils.java (97%) rename code/{common/model/src/test/java/nu/marginalia => crawl/converting-process/src/test/java/nu/marginalia/converting}/util/LineUtilsTest.java (90%) create mode 100644 code/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/ContentType.java create mode 100644 code/features/adblock/build.gradle create mode 100644 code/features/adblock/readme.md rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic => features/adblock/src/main/java/nu/marginalia/adblock}/AdblockSimulator.java (98%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic => 
features/adblock/src/main/java/nu/marginalia/adblock}/GoogleAnwersSpamDetector.java (93%) create mode 100644 code/features/pubdate/build.gradle create mode 100644 code/features/pubdate/readme.md create mode 100644 code/features/pubdate/src/main/java/nu/marginalia/pubdate/PubDateEffortLevel.java rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/PubDateHeuristic.java (56%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/PubDateParser.java (97%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/PubDateSniffer.java (87%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java (87%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java (83%) create mode 100644 code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java (64%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java (59%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java (58%) rename 
code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/heuristic/PubDateHeuristicJSONLD.java (73%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/heuristic/PubDateHeuristicLastModified.java (62%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/heuristic/PubDateHeuristicMicrodata.java (58%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/heuristic/PubDateHeuristicOpenGraph.java (59%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/heuristic/PubDateHeuristicRDFaTag.java (58%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java (74%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/pubdate/src/main/java/nu/marginalia}/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java (71%) rename code/{crawl/converting-process/src/test/java/nu/marginalia/converting/logic => features/pubdate/src/test/java/nu/marginalia/pubdate}/PubDateSnifferTest.java (90%) rename code/{libraries/misc/src/main/java/nu/marginalia/util => features/query-parser/src/main/java/nu/marginalia/transform_list}/TransformList.java (99%) rename code/{libraries/misc/src/test/java/nu/marginalia/util => features/query-parser/src/test/java/nu/marginalia/transform_list}/TransformListTest.java (98%) create mode 100644 code/features/topic-detection/build.gradle create mode 100644 code/features/topic-detection/readme.md rename 
code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/topic-detection/src/main/java/nu/marginalia}/topic/RecipeDetector.java (99%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/topic-detection/src/main/java/nu/marginalia}/topic/TextileCraftDetector.java (99%) rename code/{crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic => features/topic-detection/src/main/java/nu/marginalia}/topic/WoodworkingDetector.java (99%) rename code/libraries/{misc => braille-block-punch-cards}/build.gradle (91%) create mode 100644 code/libraries/braille-block-punch-cards/readme.md rename code/{common/model/src/main/java/nu/marginalia/util => libraries/braille-block-punch-cards/src/main/java/nu/marginalia/bbpc}/BrailleBlockPunchCards.java (98%) rename code/{common/model/src/test/java/nu/marginalia/util => libraries/braille-block-punch-cards/src/test/java/nu/marginalia/bbpc}/BrailleBlockPunchCardsTest.java (91%) rename code/{common/model/src/main/java/nu/marginalia/util => libraries/language-processing/src/main/java/nu/marginalia/language/statistics}/DenseBitMap.java (97%) rename code/{common/model/src/test/java/nu/marginalia/util => libraries/language-processing/src/test/java/nu/marginalia/language/statistics}/DenseBitMapTest.java (88%) delete mode 100644 code/libraries/misc/src/test/java/nu/marginalia/test/TestUtil.java create mode 100644 code/libraries/next-prime/build.gradle create mode 100644 code/libraries/next-prime/readme.md rename code/libraries/{misc/src/main/java/nu/marginalia/util/PrimeUtil.java => next-prime/src/main/java/nu/marginalia/util/NextPrimeUtil.java} (93%) create mode 100644 code/libraries/next-prime/src/test/java/nu/marginalia/util/NextPrimeUtilTest.java delete mode 100644 code/services-core/index-service/src/test/java/nu/marginalia/index/service/util/PrimeUtilTest.java rename {code/libraries/misc => 
other/memex}/src/main/java/nu/marginalia/util/FileSizeUtil.java (100%) diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java index d50953a2..9e08ba35 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java @@ -1,8 +1,8 @@ package nu.marginalia.index.client.model.results; -import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.model.crawl.EdgePageDocumentFlags; +import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; import java.util.Objects; @@ -26,7 +26,7 @@ public final class SearchResultKeywordScore { this.hasPriorityTerms = hasPriorityTerms; } - private boolean hasTermFlag(EdgePageWordFlags flag) { + private boolean hasTermFlag(WordFlags flag) { return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit()); } @@ -37,7 +37,7 @@ public final class SearchResultKeywordScore { sum += DocumentMetadata.decodeTopology(encodedDocMetadata); - if (DocumentMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) { + if (DocumentMetadata.hasFlags(encodedDocMetadata, DocumentFlags.Simple.asBit())) { sum += 20; } @@ -53,28 +53,28 @@ public final class SearchResultKeywordScore { public double termValue() { double sum = 0; - if (hasTermFlag(EdgePageWordFlags.Title)) { + if (hasTermFlag(WordFlags.Title)) { sum -= 15; } - if (hasTermFlag(EdgePageWordFlags.Site)) { + if (hasTermFlag(WordFlags.Site)) { sum -= 10; - } else if (hasTermFlag(EdgePageWordFlags.SiteAdjacent)) { + } else if (hasTermFlag(WordFlags.SiteAdjacent)) { sum -= 5; } - if (hasTermFlag(EdgePageWordFlags.Subjects)) { 
+ if (hasTermFlag(WordFlags.Subjects)) { sum -= 10; } - if (hasTermFlag(EdgePageWordFlags.NamesWords)) { + if (hasTermFlag(WordFlags.NamesWords)) { sum -= 1; } - if (hasTermFlag(EdgePageWordFlags.UrlDomain)) { + if (hasTermFlag(WordFlags.UrlDomain)) { sum -= 5; } - if (hasTermFlag(EdgePageWordFlags.UrlPath)) { + if (hasTermFlag(WordFlags.UrlPath)) { sum -= 5; } @@ -95,12 +95,12 @@ public final class SearchResultKeywordScore { } public boolean isKeywordSpecial() { - return keyword.contains(":") || hasTermFlag(EdgePageWordFlags.Synthetic); + return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic); } public boolean isKeywordRegular() { return !keyword.contains(":") - && !hasTermFlag(EdgePageWordFlags.Synthetic); + && !hasTermFlag(WordFlags.Synthetic); } public long encodedWordMetadata() { diff --git a/code/common/config/build.gradle b/code/common/config/build.gradle index ab1b30b4..11fabe5b 100644 --- a/code/common/config/build.gradle +++ b/code/common/config/build.gradle @@ -14,7 +14,6 @@ java { dependencies { implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') - implementation project(':code:libraries:misc') } test { diff --git a/code/common/model/build.gradle b/code/common/model/build.gradle index a6e67963..6e105132 100644 --- a/code/common/model/build.gradle +++ b/code/common/model/build.gradle @@ -14,6 +14,7 @@ dependencies { implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') implementation project(':code:libraries:big-string') + implementation project(':code:libraries:braille-block-punch-cards') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/common/model/src/main/java/nu/marginalia/model/crawl/EdgeDomainIndexingState.java b/code/common/model/src/main/java/nu/marginalia/model/crawl/DomainIndexingState.java similarity index 81% rename from 
code/common/model/src/main/java/nu/marginalia/model/crawl/EdgeDomainIndexingState.java rename to code/common/model/src/main/java/nu/marginalia/model/crawl/DomainIndexingState.java index 448641fb..1002bbfc 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/crawl/EdgeDomainIndexingState.java +++ b/code/common/model/src/main/java/nu/marginalia/model/crawl/DomainIndexingState.java @@ -1,6 +1,6 @@ package nu.marginalia.model.crawl; -public enum EdgeDomainIndexingState { +public enum DomainIndexingState { ACTIVE("Active"), EXHAUSTED("Fully Crawled"), SPECIAL("Content is side-loaded"), @@ -12,7 +12,7 @@ public enum EdgeDomainIndexingState { public String desc; - EdgeDomainIndexingState(String desc) { + DomainIndexingState(String desc) { this.desc = desc; } } diff --git a/code/common/model/src/main/java/nu/marginalia/model/crawl/EdgeContentType.java b/code/common/model/src/main/java/nu/marginalia/model/crawl/EdgeContentType.java deleted file mode 100644 index 4d447038..00000000 --- a/code/common/model/src/main/java/nu/marginalia/model/crawl/EdgeContentType.java +++ /dev/null @@ -1,15 +0,0 @@ -package nu.marginalia.model.crawl; - - -import lombok.*; - -@AllArgsConstructor -@EqualsAndHashCode -@Getter -@Setter -@Builder -@ToString -public class EdgeContentType { - public final String contentType; - public final String charset; -} diff --git a/code/common/model/src/main/java/nu/marginalia/model/crawl/EdgeUrlState.java b/code/common/model/src/main/java/nu/marginalia/model/crawl/UrlIndexingState.java similarity index 81% rename from code/common/model/src/main/java/nu/marginalia/model/crawl/EdgeUrlState.java rename to code/common/model/src/main/java/nu/marginalia/model/crawl/UrlIndexingState.java index 07802e5c..f4312480 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/crawl/EdgeUrlState.java +++ b/code/common/model/src/main/java/nu/marginalia/model/crawl/UrlIndexingState.java @@ -1,7 +1,7 @@ package nu.marginalia.model.crawl; /** This should 
correspond to EC_URL.STATE */ -public enum EdgeUrlState { +public enum UrlIndexingState { OK, REDIRECT, DEAD, diff --git a/code/common/model/src/main/java/nu/marginalia/model/dbcommon/EdgeDomainBlacklist.java b/code/common/model/src/main/java/nu/marginalia/model/dbcommon/DomainBlacklist.java similarity index 83% rename from code/common/model/src/main/java/nu/marginalia/model/dbcommon/EdgeDomainBlacklist.java rename to code/common/model/src/main/java/nu/marginalia/model/dbcommon/DomainBlacklist.java index f659a57a..07fe1399 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/dbcommon/EdgeDomainBlacklist.java +++ b/code/common/model/src/main/java/nu/marginalia/model/dbcommon/DomainBlacklist.java @@ -5,8 +5,8 @@ import gnu.trove.set.hash.TIntHashSet; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.id.EdgeId; -@ImplementedBy(EdgeDomainBlacklistImpl.class) -public interface EdgeDomainBlacklist { +@ImplementedBy(DomainBlacklistImpl.class) +public interface DomainBlacklist { boolean isBlacklisted(int domainId); default boolean isBlacklisted(EdgeId domainId) { return isBlacklisted(domainId.id()); diff --git a/code/common/model/src/main/java/nu/marginalia/model/dbcommon/EdgeDomainBlacklistImpl.java b/code/common/model/src/main/java/nu/marginalia/model/dbcommon/DomainBlacklistImpl.java similarity index 91% rename from code/common/model/src/main/java/nu/marginalia/model/dbcommon/EdgeDomainBlacklistImpl.java rename to code/common/model/src/main/java/nu/marginalia/model/dbcommon/DomainBlacklistImpl.java index 053ced8e..1afe30fc 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/dbcommon/EdgeDomainBlacklistImpl.java +++ b/code/common/model/src/main/java/nu/marginalia/model/dbcommon/DomainBlacklistImpl.java @@ -6,20 +6,19 @@ import com.zaxxer.hikari.HikariDataSource; import gnu.trove.set.hash.TIntHashSet; import io.reactivex.rxjava3.schedulers.Schedulers; import lombok.SneakyThrows; -import nu.marginalia.model.EdgeDomain; import 
org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.concurrent.TimeUnit; @Singleton -public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist { +public class DomainBlacklistImpl implements DomainBlacklist { private volatile TIntHashSet spamDomainSet = new TIntHashSet(); private final HikariDataSource dataSource; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public EdgeDomainBlacklistImpl(HikariDataSource dataSource) { + public DomainBlacklistImpl(HikariDataSource dataSource) { this.dataSource = dataSource; Schedulers.io().schedulePeriodicallyDirect(this::updateSpamList, 5, 600, TimeUnit.SECONDS); diff --git a/code/common/model/src/main/java/nu/marginalia/model/crawl/EdgePageDocumentFlags.java b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java similarity index 65% rename from code/common/model/src/main/java/nu/marginalia/model/crawl/EdgePageDocumentFlags.java rename to code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java index 04f55edf..7528a4eb 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/crawl/EdgePageDocumentFlags.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java @@ -1,8 +1,8 @@ -package nu.marginalia.model.crawl; +package nu.marginalia.model.idx; import java.util.EnumSet; -public enum EdgePageDocumentFlags { +public enum DocumentFlags { /** Simple processing was done, this document should be de-prioritized as a search result */ Simple, @@ -23,10 +23,10 @@ public enum EdgePageDocumentFlags { return (asBit() & value) > 0; } - public static EnumSet decode(long encodedValue) { - EnumSet ret = EnumSet.noneOf(EdgePageDocumentFlags.class); + public static EnumSet decode(long encodedValue) { + EnumSet ret = EnumSet.noneOf(DocumentFlags.class); - for (EdgePageDocumentFlags f : values()) { + for (DocumentFlags f : values()) { if ((encodedValue & f.asBit()) > 0) { ret.add(f); } diff --git 
a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java index 56bdcaf8..e290c218 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java @@ -1,6 +1,5 @@ package nu.marginalia.model.idx; -import nu.marginalia.model.crawl.EdgePageDocumentFlags; import nu.marginalia.model.crawl.PubDate; import java.util.EnumSet; @@ -44,7 +43,7 @@ public record DocumentMetadata(int rank, public DocumentMetadata() { this(defaultValue()); } - public DocumentMetadata(int topology, int year, int sets, int quality, EnumSet flags) { + public DocumentMetadata(int topology, int year, int sets, int quality, EnumSet flags) { this(0, 0, topology, year, sets, quality, encodeFlags(flags)); } @@ -58,13 +57,13 @@ public record DocumentMetadata(int rank, return new DocumentMetadata(rank, encSize, topology, year, sets, quality, flags); } - private static byte encodeFlags(Set flags) { + private static byte encodeFlags(Set flags) { byte ret = 0; for (var flag : flags) { ret |= flag.asBit(); } return ret; } - public boolean hasFlag(EdgePageDocumentFlags flag) { + public boolean hasFlag(DocumentFlags flag) { return (flags & flag.asBit()) != 0; } diff --git a/code/common/model/src/main/java/nu/marginalia/model/crawl/EdgePageWordFlags.java b/code/common/model/src/main/java/nu/marginalia/model/idx/WordFlags.java similarity index 63% rename from code/common/model/src/main/java/nu/marginalia/model/crawl/EdgePageWordFlags.java rename to code/common/model/src/main/java/nu/marginalia/model/idx/WordFlags.java index 4f0188d1..dec7437e 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/crawl/EdgePageWordFlags.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/WordFlags.java @@ -1,23 +1,20 @@ -package nu.marginalia.model.crawl; +package nu.marginalia.model.idx; import 
java.util.EnumSet; -public enum EdgePageWordFlags { +public enum WordFlags { /** Word appears in title */ Title, - /** Word appears to be the subject in several sentences - * @see SubjectCounter */ + /** Word appears to be the subject in several sentences */ Subjects, - /** Word has high tf-idf - * @see KeywordCounter */ + /** Word has high tf-idf */ TfIdfHigh, - /** Word is a likely named object. This is a weaker version of Subjects. - * @see NameCounter */ + /** Word is a likely named object. This is a weaker version of Subjects. */ NamesWords, /** The word isn't actually a word on page, but a fake keyword from the code @@ -26,12 +23,10 @@ public enum EdgePageWordFlags { Synthetic, /** Word is important to site - * @see SiteWords */ Site, /** Word is important to adjacent documents - * @see SiteWords * */ SiteAdjacent, @@ -54,10 +49,10 @@ public enum EdgePageWordFlags { return (asBit() & value) > 0; } - public static EnumSet decode(long encodedValue) { - EnumSet ret = EnumSet.noneOf(EdgePageWordFlags.class); + public static EnumSet decode(long encodedValue) { + EnumSet ret = EnumSet.noneOf(WordFlags.class); - for (EdgePageWordFlags f : values()) { + for (WordFlags f : values()) { if ((encodedValue & f.asBit()) > 0) { ret.add(f); } diff --git a/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java b/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java index 8c0be42b..511563f3 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java @@ -1,7 +1,7 @@ package nu.marginalia.model.idx; -import nu.marginalia.model.crawl.EdgePageWordFlags; -import nu.marginalia.util.BrailleBlockPunchCards; + +import nu.marginalia.bbpc.BrailleBlockPunchCards; import java.util.EnumSet; import java.util.Set; @@ -39,12 +39,12 @@ public record WordMetadata(int tfIdf, public WordMetadata(int tfIdf, int positions, - Set flags) + Set flags) { 
this(tfIdf, positions, encodeFlags(flags)); } - private static byte encodeFlags(Set flags) { + private static byte encodeFlags(Set flags) { byte ret = 0; for (var flag : flags) { ret |= flag.asBit(); } return ret; @@ -64,7 +64,7 @@ public record WordMetadata(int tfIdf, return (meta >>> TF_IDF_SHIFT) & TF_IDF_MASK; } - public boolean hasFlag(EdgePageWordFlags flag) { + public boolean hasFlag(WordFlags flag) { return (flags & flag.asBit()) != 0; } @@ -98,7 +98,7 @@ public record WordMetadata(int tfIdf, } - public EnumSet flagSet() { - return EdgePageWordFlags.decode(flags); + public EnumSet flagSet() { + return WordFlags.decode(flags); } } diff --git a/code/common/model/src/main/java/nu/marginalia/util/QueryParams.java b/code/common/model/src/main/java/nu/marginalia/util/QueryParams.java index 430758fd..160ce47f 100644 --- a/code/common/model/src/main/java/nu/marginalia/util/QueryParams.java +++ b/code/common/model/src/main/java/nu/marginalia/util/QueryParams.java @@ -11,7 +11,6 @@ import java.util.regex.Pattern; public class QueryParams { - private static final Pattern paramSplitterPattern = Pattern.compile("&"); @Nullable public static String queryParamsSanitizer(String path, @Nullable String queryParams) { diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/model/DocumentMetadataTest.java b/code/common/model/src/test/java/nu/marginalia/model/DocumentMetadataTest.java similarity index 95% rename from code/services-core/index-service/src/test/java/nu/marginalia/index/model/DocumentMetadataTest.java rename to code/common/model/src/test/java/nu/marginalia/model/DocumentMetadataTest.java index a6dbfb9d..012794f4 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/model/DocumentMetadataTest.java +++ b/code/common/model/src/test/java/nu/marginalia/model/DocumentMetadataTest.java @@ -1,7 +1,7 @@ -package nu.marginalia.index.model; +package nu.marginalia.model; -import nu.marginalia.model.crawl.EdgePageDocumentFlags; 
+import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; import org.junit.jupiter.api.Test; @@ -67,7 +67,7 @@ class DocumentMetadataTest { @Test public void encRank() { - var meta = new DocumentMetadata(5, 22, 3, 8, EnumSet.noneOf(EdgePageDocumentFlags.class)) + var meta = new DocumentMetadata(5, 22, 3, 8, EnumSet.noneOf(DocumentFlags.class)) .withSize(0xffffffff).encode(); var enc2 = DocumentMetadata.encodeRank(meta, 83); diff --git a/code/common/model/src/test/java/nu/marginalia/model/WordMetadataTest.java b/code/common/model/src/test/java/nu/marginalia/model/WordMetadataTest.java index 6f612374..104750e8 100644 --- a/code/common/model/src/test/java/nu/marginalia/model/WordMetadataTest.java +++ b/code/common/model/src/test/java/nu/marginalia/model/WordMetadataTest.java @@ -1,6 +1,6 @@ package nu.marginalia.model; -import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordMetadata; import org.junit.jupiter.api.Test; @@ -12,16 +12,16 @@ class WordMetadataTest { @Test public void codecTest() { - verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, EnumSet.allOf(EdgePageWordFlags.class))); - verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, EnumSet.allOf(EdgePageWordFlags.class))); - verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, EnumSet.noneOf(EdgePageWordFlags.class))); - System.out.println(new WordMetadata(32, 0x7f0f0005, EnumSet.allOf(EdgePageWordFlags.class))); - System.out.println(new WordMetadata(32, 0xff0f0013, EnumSet.noneOf(EdgePageWordFlags.class))); + verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, EnumSet.allOf(WordFlags.class))); + verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, EnumSet.allOf(WordFlags.class))); + verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, EnumSet.noneOf(WordFlags.class))); + System.out.println(new WordMetadata(32, 0x7f0f0005, 
EnumSet.allOf(WordFlags.class))); + System.out.println(new WordMetadata(32, 0xff0f0013, EnumSet.noneOf(WordFlags.class))); } @Test public void testClampTfIdfLow() { - var original = new WordMetadata(0x8000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class)); + var original = new WordMetadata(0x8000FFFF, 0, EnumSet.noneOf(WordFlags.class)); var encoded = new WordMetadata(original.encode()); assertEquals(original.positions(), encoded.positions()); @@ -30,7 +30,7 @@ class WordMetadataTest { @Test public void testClampTfIdfHigh() { - var original = new WordMetadata(0x7000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class)); + var original = new WordMetadata(0x7000FFFF, 0, EnumSet.noneOf(WordFlags.class)); var encoded = new WordMetadata(original.encode()); assertEquals(original.positions(), encoded.positions()); diff --git a/code/common/service/build.gradle b/code/common/service/build.gradle index 2c7a3942..bad65877 100644 --- a/code/common/service/build.gradle +++ b/code/common/service/build.gradle @@ -12,7 +12,6 @@ java { dependencies { implementation project(':code:common:service-client') implementation project(':code:common:service-discovery') - implementation project(':code:libraries:misc') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/common/model/src/main/java/nu/marginalia/model/crawl/EdgeHtmlStandard.java b/code/crawl/common/src/main/java/nu/marginalia/crawling/common/model/HtmlStandard.java similarity index 79% rename from code/common/model/src/main/java/nu/marginalia/model/crawl/EdgeHtmlStandard.java rename to code/crawl/common/src/main/java/nu/marginalia/crawling/common/model/HtmlStandard.java index 17788cca..6539dfe1 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/crawl/EdgeHtmlStandard.java +++ b/code/crawl/common/src/main/java/nu/marginalia/crawling/common/model/HtmlStandard.java @@ -1,7 +1,7 @@ -package nu.marginalia.model.crawl; +package nu.marginalia.crawling.common.model; -public enum EdgeHtmlStandard { 
+public enum HtmlStandard { PLAIN(0, 1, 1993), UNKNOWN(0, 1, 2000), HTML123(0, 1, 1997), @@ -18,7 +18,7 @@ public enum EdgeHtmlStandard { * */ public final int yearGuess; - EdgeHtmlStandard(double offset, double scale, int yearGuess) { + HtmlStandard(double offset, double scale, int yearGuess) { this.offset = offset; this.scale = scale; this.yearGuess = yearGuess; diff --git a/code/crawl/converting-model/build.gradle b/code/crawl/converting-model/build.gradle index 24678acb..15382ff2 100644 --- a/code/crawl/converting-model/build.gradle +++ b/code/crawl/converting-model/build.gradle @@ -18,6 +18,8 @@ dependencies { implementation project(':code:common:service-client') implementation project(':code:libraries:language-processing') + implementation project(':code:crawl:common') + implementation libs.lombok annotationProcessor libs.lombok implementation libs.bundles.slf4j diff --git a/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java b/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java index 7a2721fb..45da80ca 100644 --- a/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java +++ b/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java @@ -2,7 +2,7 @@ package nu.marginalia.converting.instruction; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.converting.model.DocumentKeywords; import nu.marginalia.converting.instruction.instructions.DomainLink; @@ -15,7 +15,7 @@ public interface Interpreter { void loadRssFeed(EdgeUrl[] rssFeed); void loadDomainLink(DomainLink[] links); - void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip); + void 
loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip); void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument); void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError); diff --git a/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java b/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java index 6c56a100..e03593e9 100644 --- a/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java +++ b/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java @@ -1,7 +1,7 @@ package nu.marginalia.converting.instruction.instructions; -import nu.marginalia.model.crawl.EdgeHtmlStandard; -import nu.marginalia.model.crawl.EdgeUrlState; +import nu.marginalia.crawling.common.model.HtmlStandard; +import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.InstructionTag; import nu.marginalia.converting.instruction.Interpreter; @@ -10,11 +10,11 @@ import org.jetbrains.annotations.Nullable; public record LoadProcessedDocument(EdgeUrl url, - EdgeUrlState state, + UrlIndexingState state, String title, String description, int htmlFeatures, - EdgeHtmlStandard standard, + HtmlStandard standard, int length, long hash, double quality, diff --git a/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java b/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java index b798ac49..28e42f5d 100644 --- a/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java +++ 
b/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java @@ -1,6 +1,6 @@ package nu.marginalia.converting.instruction.instructions; -import nu.marginalia.model.crawl.EdgeUrlState; +import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.InstructionTag; import nu.marginalia.converting.instruction.Interpreter; @@ -8,7 +8,7 @@ import nu.marginalia.model.EdgeUrl; public record LoadProcessedDocumentWithError(EdgeUrl url, - EdgeUrlState state, + UrlIndexingState state, String reason) implements Instruction { @Override diff --git a/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDomain.java b/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDomain.java index b7784a2b..1186c38d 100644 --- a/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDomain.java +++ b/code/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDomain.java @@ -1,12 +1,12 @@ package nu.marginalia.converting.instruction.instructions; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.InstructionTag; import nu.marginalia.converting.instruction.Interpreter; import nu.marginalia.model.EdgeDomain; -public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) implements Instruction { +public record LoadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) implements Instruction { @Override public void apply(Interpreter interpreter) { diff --git 
a/code/crawl/converting-model/src/main/java/nu/marginalia/converting/model/DocumentKeywordsBuilder.java b/code/crawl/converting-model/src/main/java/nu/marginalia/converting/model/DocumentKeywordsBuilder.java index fc8dcfea..f166fe44 100644 --- a/code/crawl/converting-model/src/main/java/nu/marginalia/converting/model/DocumentKeywordsBuilder.java +++ b/code/crawl/converting-model/src/main/java/nu/marginalia/converting/model/DocumentKeywordsBuilder.java @@ -3,7 +3,7 @@ package nu.marginalia.converting.model; import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap; import lombok.Getter; import lombok.ToString; -import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.WordFlags; import java.util.*; @@ -54,14 +54,14 @@ public class DocumentKeywordsBuilder { words.putIfAbsent(word, 0); } - public void setFlagOnMetadataForWords(EdgePageWordFlags flag, Set flagWords) { + public void setFlagOnMetadataForWords(WordFlags flag, Set flagWords) { flagWords.forEach(word -> words.mergeLong(word, flag.asBit(), (a, b) -> a|b) ); } public void addAllSyntheticTerms(Collection newWords) { - long meta = EdgePageWordFlags.Synthetic.asBit(); + long meta = WordFlags.Synthetic.asBit(); newWords.forEach(word -> { words.putIfAbsent(word, meta); diff --git a/code/crawl/converting-process/build.gradle b/code/crawl/converting-process/build.gradle index 7bb5a209..805a5bc8 100644 --- a/code/crawl/converting-process/build.gradle +++ b/code/crawl/converting-process/build.gradle @@ -20,20 +20,27 @@ tasks.distZip.enabled = false dependencies { implementation project(':third-party') + implementation project(':code:api:index-api') + implementation project(':code:common:model') implementation project(':code:common:service') implementation project(':code:common:config') + implementation project(':code:common:service-discovery') + implementation project(':code:common:service-client') + implementation project(':code:libraries:guarded-regex') implementation 
project(':code:libraries:easy-lsh') implementation project(':code:libraries:big-string') - implementation project(':code:api:index-api') - implementation project(':code:common:service-discovery') - implementation project(':code:common:service-client') implementation project(':code:libraries:language-processing') + implementation project(':code:crawl:common') implementation project(':code:crawl:converting-model') implementation project(':code:crawl:crawling-model') + implementation project(':code:features:adblock') + implementation project(':code:features:pubdate') + implementation project(':code:features:topic-detection') + implementation libs.lombok annotationProcessor libs.lombok implementation libs.bundles.slf4j diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java index b2eda5ea..e1669255 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java @@ -1,7 +1,7 @@ package nu.marginalia.converting; import com.github.luben.zstd.ZstdOutputStream; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.converting.instruction.Interpreter; import nu.marginalia.converting.model.DocumentKeywords; @@ -49,7 +49,7 @@ public class ConversionLog implements AutoCloseable, Interpreter { public void loadDomainLink(DomainLink[] links) {} @Override - public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {} + public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) {} @Override public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {} diff --git 
a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java index 5c8e82c4..2e79237c 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java @@ -2,7 +2,7 @@ package nu.marginalia.converting; import com.github.luben.zstd.ZstdOutputStream; import com.google.gson.Gson; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.Interpreter; @@ -106,7 +106,7 @@ public class InstructionWriter { public void loadDomainLink(DomainLink[] links) {} @Override - public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) { + public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) { this.domainName = domain.toString(); } diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java index 4b0fbb36..3a9de512 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java @@ -1,8 +1,8 @@ package nu.marginalia.converting.model; import lombok.ToString; -import nu.marginalia.model.crawl.EdgePageDocumentFlags; -import nu.marginalia.model.crawl.EdgeUrlState; +import nu.marginalia.model.idx.DocumentFlags; +import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.model.EdgeUrl; import java.util.OptionalDouble; @@ -14,11 +14,11 @@ public class 
ProcessedDocument { public ProcessedDocumentDetails details; public DocumentKeywordsBuilder words; - public EdgeUrlState state; + public UrlIndexingState state; public String stateReason; public boolean isOk() { - return EdgeUrlState.OK == state; + return UrlIndexingState.OK == state; } public boolean isProcessedFully() { @@ -28,7 +28,7 @@ public class ProcessedDocument { if (details == null) return false; - return !details.metadata.hasFlag(EdgePageDocumentFlags.Simple); + return !details.metadata.hasFlag(DocumentFlags.Simple); } public OptionalDouble quality() { diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java index b6cd4d56..5c395e31 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java @@ -1,7 +1,7 @@ package nu.marginalia.converting.model; import lombok.ToString; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.EdgeUrl; @@ -23,7 +23,7 @@ public class ProcessedDocumentDetails { public long hashCode; public Set features; - public EdgeHtmlStandard standard; + public HtmlStandard standard; public List linksInternal; public List linksExternal; diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java index f3a08b98..95b66a02 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java +++ 
b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java @@ -2,7 +2,7 @@ package nu.marginalia.converting.model; import lombok.ToString; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.DomainIndexingState; import java.util.List; import java.util.Optional; @@ -13,7 +13,7 @@ public class ProcessedDomain { public EdgeDomain domain; public List documents; - public EdgeDomainIndexingState state; + public DomainIndexingState state; public EdgeDomain redirect; public String ip; diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java index b7072236..3a2728a4 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -4,7 +4,7 @@ import com.google.inject.Inject; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.model.crawl.EdgeUrlState; +import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin; @@ -45,12 +45,12 @@ public class DocumentProcessor { processDocument(crawledDocument, crawledDomain, ret); } catch (DisqualifiedException ex) { - ret.state = EdgeUrlState.DISQUALIFIED; + ret.state = UrlIndexingState.DISQUALIFIED; ret.stateReason = ex.reason.toString(); logger.debug("Disqualified {}: {}", ret.url, ex.reason); } catch (Exception ex) { - ret.state = EdgeUrlState.DISQUALIFIED; + ret.state = 
UrlIndexingState.DISQUALIFIED; ret.stateReason = DisqualifiedException.DisqualificationReason.PROCESSING_EXCEPTION.toString(); logger.info("Failed to convert " + crawledDocument.url, ex); ex.printStackTrace(); @@ -125,11 +125,11 @@ public class DocumentProcessor { return false; } - private EdgeUrlState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) { + private UrlIndexingState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) { return switch (CrawlerDocumentStatus.valueOf(crawlerStatus)) { - case OK -> httpStatus < 300 ? EdgeUrlState.OK : EdgeUrlState.DEAD; - case REDIRECT -> EdgeUrlState.REDIRECT; - default -> EdgeUrlState.DEAD; + case OK -> httpStatus < 300 ? UrlIndexingState.OK : UrlIndexingState.DEAD; + case REDIRECT -> UrlIndexingState.REDIRECT; + default -> UrlIndexingState.DEAD; }; } diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index c3c32130..161f78ca 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -6,11 +6,10 @@ import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.crawling.model.CrawlerDomainStatus; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.util.StringPool; import nu.marginalia.converting.processor.logic.links.InternalLinkGraph; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; @@ -134,12 +133,12 @@ public class 
DomainProcessor { } } - private EdgeDomainIndexingState getState(String crawlerStatus) { + private DomainIndexingState getState(String crawlerStatus) { return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) { - case OK -> EdgeDomainIndexingState.ACTIVE; - case REDIRECT -> EdgeDomainIndexingState.REDIR; - case BLOCKED -> EdgeDomainIndexingState.BLOCKED; - default -> EdgeDomainIndexingState.ERROR; + case OK -> DomainIndexingState.ACTIVE; + case REDIRECT -> DomainIndexingState.REDIR; + case BLOCKED -> DomainIndexingState.BLOCKED; + default -> DomainIndexingState.ERROR; }; } diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java index 6c16c068..afbd0ab3 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java @@ -1,7 +1,7 @@ package nu.marginalia.converting.processor; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; -import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.WordFlags; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeUrl; @@ -24,7 +24,7 @@ public class SiteWords { Map> linkedKeywords = getAdjacentWords(internalLinkGraph); for (var doc : processedDomain.documents) { - applyKeywordsToDoc(doc, EdgePageWordFlags.SiteAdjacent, linkedKeywords.get(doc.url)); + applyKeywordsToDoc(doc, WordFlags.SiteAdjacent, linkedKeywords.get(doc.url)); } } @@ -33,17 +33,17 @@ public class SiteWords { Set commonSiteWords = new HashSet<>(10); commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, - EdgePageWordFlags.Subjects)); + WordFlags.Subjects)); 
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, - EdgePageWordFlags.Title)); + WordFlags.Title)); if (commonSiteWords.isEmpty()) { return; } for (var doc : processedDomain.documents) { - applyKeywordsToDoc(doc, EdgePageWordFlags.Site, commonSiteWords); + applyKeywordsToDoc(doc, WordFlags.Site, commonSiteWords); } } @@ -74,7 +74,7 @@ public class SiteWords { return linkedKeywords; } - private void applyKeywordsToDoc(ProcessedDocument doc, EdgePageWordFlags flag, Set words) { + private void applyKeywordsToDoc(ProcessedDocument doc, WordFlags flag, Set words) { if (doc.words != null && words != null) { doc.words.setFlagOnMetadataForWords(flag, words); } diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/SimpleKeywords.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/SimpleKeywords.java index 92658991..456dba52 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/SimpleKeywords.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/SimpleKeywords.java @@ -7,7 +7,7 @@ import nu.marginalia.language.keywords.KeywordExtractor; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.KeywordMetadata; import nu.marginalia.language.model.WordRep; -import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.WordFlags; import java.util.EnumSet; @@ -22,7 +22,7 @@ public class SimpleKeywords { KeywordMetadata metadata, DocumentLanguageData documentLanguageData) { - EnumSet flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class); + EnumSet flagsTemplate = EnumSet.noneOf(WordFlags.class); for (var sent : documentLanguageData.sentences) { diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java 
b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index 36d4f3b0..ba344226 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.logic; import crawlercommons.utils.Strings; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.converting.model.DisqualifiedException; import org.jsoup.nodes.Document; @@ -22,7 +22,7 @@ public class DocumentValuator { ); - public double getQuality(CrawledDocument crawledDocument, EdgeHtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException { + public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException { double smutCoefficient = dld.streamLowerCase().filter(filthTable::contains).count(); double scriptPenalty = getScriptPenalty(parsedDocument); diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index 0ff35164..fea500d3 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -2,10 +2,14 @@ package nu.marginalia.converting.processor.logic; import com.google.inject.Inject; import com.google.inject.Singleton; +import nu.marginalia.adblock.AdblockSimulator; 
+import nu.marginalia.adblock.GoogleAnwersSpamDetector; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.converting.processor.logic.topic.*; +import nu.marginalia.topic.RecipeDetector; +import nu.marginalia.topic.TextileCraftDetector; +import nu.marginalia.topic.WoodworkingDetector; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java index 5179274c..a3ee8d22 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java @@ -1,7 +1,7 @@ package nu.marginalia.converting.processor.logic; import com.google.common.base.Strings; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import org.jsoup.nodes.Document; import org.jsoup.nodes.DocumentType; import org.slf4j.Logger; @@ -12,53 +12,53 @@ public class HtmlStandardExtractor { private static final Logger logger = LoggerFactory.getLogger(HtmlStandardExtractor.class); - public static EdgeHtmlStandard parseDocType(DocumentType docType) { + public static HtmlStandard parseDocType(DocumentType docType) { if (null == docType) { - return EdgeHtmlStandard.UNKNOWN; + return HtmlStandard.UNKNOWN; } String publicId = docType.publicId(); if (Strings.isNullOrEmpty(publicId)) - return EdgeHtmlStandard.HTML5; + return HtmlStandard.HTML5; publicId = publicId.toUpperCase(); if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 4")) { - return EdgeHtmlStandard.HTML4; + 
return HtmlStandard.HTML4; } if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 3")) { - return EdgeHtmlStandard.HTML123; + return HtmlStandard.HTML123; } if (publicId.startsWith("-//INTERNET/RFC XXXX//EN")) - return EdgeHtmlStandard.HTML123; + return HtmlStandard.HTML123; if (publicId.startsWith("-//NETSCAPE COMM. CORP")) - return EdgeHtmlStandard.HTML123; + return HtmlStandard.HTML123; if (publicId.startsWith("-//SQ//DTD HTML 2")) - return EdgeHtmlStandard.HTML123; + return HtmlStandard.HTML123; if (publicId.startsWith("-//SOFTQUAD//DTD HTML 2")) - return EdgeHtmlStandard.HTML123; + return HtmlStandard.HTML123; if (publicId.startsWith("-//W3O//DTD W3 HTML 2")) - return EdgeHtmlStandard.HTML123; + return HtmlStandard.HTML123; if (publicId.startsWith("-//IETF//DTD HTML 2")) - return EdgeHtmlStandard.HTML123; + return HtmlStandard.HTML123; if (publicId.startsWith("-//IETF//DTD HTML//EN")) - return EdgeHtmlStandard.HTML123; + return HtmlStandard.HTML123; if (publicId.startsWith("-/W3C//DTD HTML 3")) - return EdgeHtmlStandard.HTML123; + return HtmlStandard.HTML123; if (publicId.startsWith("-/W3C/DTD HTML 3")) - return EdgeHtmlStandard.HTML123; + return HtmlStandard.HTML123; if (publicId.startsWith("-//IETF//DTD HTML 3")) - return EdgeHtmlStandard.HTML123; + return HtmlStandard.HTML123; if (publicId.startsWith("-//W3C//DTD XHTML")) - return EdgeHtmlStandard.XHTML; + return HtmlStandard.XHTML; if (publicId.startsWith("ISO/IEC 15445:2000//DTD")) - return EdgeHtmlStandard.XHTML; + return HtmlStandard.XHTML; if (publicId.startsWith("-//W3C//DTD HTML")) - return EdgeHtmlStandard.HTML4; + return HtmlStandard.HTML4; logger.debug("Unknown publicID standard {}", publicId); - return EdgeHtmlStandard.UNKNOWN; + return HtmlStandard.UNKNOWN; } - public static EdgeHtmlStandard sniffHtmlStandard(Document parsed) { + public static HtmlStandard sniffHtmlStandard(Document parsed) { int html4Attributes = 0; int html5Attributes = 0; @@ -72,11 +72,11 @@ public 
class HtmlStandardExtractor { html4Attributes++; } if (html5Attributes > 0) { - return EdgeHtmlStandard.HTML5; + return HtmlStandard.HTML5; } if (html4Attributes > 0) { - return EdgeHtmlStandard.HTML4; + return HtmlStandard.HTML4; } - return EdgeHtmlStandard.HTML123; + return HtmlStandard.HTML123; } } diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java index 4cb4d96d..b45efb4a 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java @@ -1,7 +1,7 @@ package nu.marginalia.converting.processor.logic; import com.google.inject.Singleton; -import nu.marginalia.model.crawl.EdgeUrlState; +import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.lsh.EasyLSH; import org.slf4j.Logger; @@ -52,7 +52,7 @@ public class LshDocumentDeduplicator { { logger.debug("{} duplicates {}", otherDoc.url, thisDoc.url); - otherDoc.state = EdgeUrlState.DISQUALIFIED; + otherDoc.state = UrlIndexingState.DISQUALIFIED; otherDoc.stateReason = "Duplicate"; return true; diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/CommonKeywordExtractor.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/CommonKeywordExtractor.java index 3172252d..e4c6e6f0 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/CommonKeywordExtractor.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/CommonKeywordExtractor.java @@ -1,7 +1,7 @@ package 
nu.marginalia.converting.processor.logic.links; import ca.rmen.porterstemmer.PorterStemmer; -import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.WordFlags; import nu.marginalia.converting.model.ProcessedDomain; import java.util.*; @@ -16,7 +16,7 @@ public class CommonKeywordExtractor { private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5; - public List getCommonSiteWords(ProcessedDomain ret, EdgePageWordFlags... flags) { + public List getCommonSiteWords(ProcessedDomain ret, WordFlags... flags) { if (ret.documents.size() < MIN_REQUIRED_DOCUMENTS) return Collections.emptyList(); @@ -27,7 +27,7 @@ public class CommonKeywordExtractor { final Map> stemmedToNonstemmedVariants = new HashMap<>(ret.documents.size()*10); int qualifiedDocCount = 0; - long wordFlags = Arrays.stream(flags).mapToInt(EdgePageWordFlags::asBit).reduce(0, (a,b) -> a|b); + long wordFlags = Arrays.stream(flags).mapToInt(WordFlags::asBit).reduce(0, (a, b) -> a|b); for (var doc : ret.documents) { if (doc.words == null) continue; diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/InternalLinkGraph.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/InternalLinkGraph.java index 4933e074..175741bd 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/InternalLinkGraph.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/InternalLinkGraph.java @@ -1,6 +1,6 @@ package nu.marginalia.converting.processor.logic.links; -import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.WordFlags; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.model.EdgeUrl; @@ -22,7 +22,7 @@ public class InternalLinkGraph { internalLinkGraph.put(doc.url, new HashSet<>(doc.details.linksInternal)); knownUrls.addAll(doc.details.linksInternal); - List 
topKeywords = doc.words.getWordsWithAnyFlag(EdgePageWordFlags.TfIdfHigh.asBit() | EdgePageWordFlags.Subjects.asBit()); + List topKeywords = doc.words.getWordsWithAnyFlag(WordFlags.TfIdfHigh.asBit() | WordFlags.Subjects.asBit()); topKeywordsByUrl.put(doc.url, new HashSet<>(topKeywords)); candidateKeywordsByUrl.put(doc.url, new HashSet<>(topKeywords)); diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateEffortLevel.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateEffortLevel.java deleted file mode 100644 index a69373bb..00000000 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateEffortLevel.java +++ /dev/null @@ -1,6 +0,0 @@ -package nu.marginalia.converting.processor.logic.pubdate; - -public enum PubDateEffortLevel { - LOW, - HIGH -} diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java deleted file mode 100644 index da44e3fa..00000000 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java +++ /dev/null @@ -1,23 +0,0 @@ -package nu.marginalia.converting.processor.logic.pubdate.heuristic; - -import nu.marginalia.model.crawl.EdgeHtmlStandard; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.model.EdgeUrl; -import org.jsoup.nodes.Document; - -import java.util.Optional; - -public class 
PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic { - - @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { - if (htmlStandard == EdgeHtmlStandard.UNKNOWN) - return Optional.empty(); - - return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard))); - } - -} diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java index 107b73f9..cc547972 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java @@ -4,7 +4,7 @@ import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.language.LanguageFilter; import nu.marginalia.language.model.DocumentLanguageData; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.converting.model.DocumentKeywordsBuilder; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.converting.model.DisqualifiedException; @@ -56,7 +56,7 @@ public abstract class AbstractDocumentProcessorPlugin { return this; } - public MetaTagsBuilder addFormat(EdgeHtmlStandard standard) { + public MetaTagsBuilder addFormat(HtmlStandard standard) { tagWords.add("format:"+standard.toString().toLowerCase()); return this; } diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 
6540aef2..d6093865 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -7,23 +7,22 @@ import nu.marginalia.converting.processor.logic.summary.SummaryExtractor; import nu.marginalia.crawling.common.link.LinkParser; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.language.model.KeywordMetadata; import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.model.crawl.EdgeHtmlStandard; -import nu.marginalia.model.crawl.EdgePageDocumentFlags; +import nu.marginalia.crawling.common.model.HtmlStandard; +import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.converting.model.DocumentKeywordsBuilder; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.converting.processor.logic.*; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.PubDateSniffer; import nu.marginalia.gregex.GuardedRegex; import nu.marginalia.gregex.GuardedRegexFactory; import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.ProcessedDocumentDetails; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.pubdate.PubDateSniffer; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -120,7 +119,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin ret.hashCode = dld.localitySensitiveHashCode(); PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true); - ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) 
-ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class)); + ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(DocumentFlags.class)); DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url); @@ -262,10 +261,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin words.addAllSyntheticTerms(linkTerms); } - private EdgeHtmlStandard getHtmlStandard(Document doc) { - EdgeHtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType()); + private HtmlStandard getHtmlStandard(Document doc) { + HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType()); - if (EdgeHtmlStandard.UNKNOWN.equals(htmlStandard)) { + if (HtmlStandard.UNKNOWN.equals(htmlStandard)) { return HtmlStandardExtractor.sniffHtmlStandard(doc); } return htmlStandard; diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java index f6b7bf43..c42b88e1 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java @@ -4,11 +4,10 @@ import com.google.inject.Inject; import com.google.inject.name.Named; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.language.model.KeywordMetadata; import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.model.crawl.EdgeHtmlStandard; -import nu.marginalia.model.crawl.EdgePageDocumentFlags; +import nu.marginalia.crawling.common.model.HtmlStandard; +import 
nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.converting.model.DocumentKeywordsBuilder; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.crawl.PubDate; @@ -16,7 +15,7 @@ import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.ProcessedDocumentDetails; import nu.marginalia.converting.processor.logic.PlainTextLogic; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.util.LineUtils; +import nu.marginalia.converting.util.LineUtils; import org.apache.commons.lang3.StringUtils; import java.net.URISyntaxException; @@ -78,7 +77,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP List firstFewLines = LineUtils.firstNLines(documentBody, 40); ret.length = documentBody.length(); - ret.standard = EdgeHtmlStandard.PLAIN; + ret.standard = HtmlStandard.PLAIN; ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength); ret.quality = -1; @@ -89,7 +88,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1)); - ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText)); + ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(DocumentFlags.PlainText)); DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url); diff --git a/code/common/model/src/main/java/nu/marginalia/util/LineUtils.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/util/LineUtils.java similarity index 97% rename from code/common/model/src/main/java/nu/marginalia/util/LineUtils.java rename to code/crawl/converting-process/src/main/java/nu/marginalia/converting/util/LineUtils.java index 0bb785a0..33d4f789 100644 --- a/code/common/model/src/main/java/nu/marginalia/util/LineUtils.java +++ 
b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/util/LineUtils.java @@ -1,4 +1,4 @@ -package nu.marginalia.util; +package nu.marginalia.converting.util; import java.util.ArrayList; import java.util.List; diff --git a/code/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/PlainTextLogicTest.java b/code/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/PlainTextLogicTest.java index eea3668c..7ca43bc9 100644 --- a/code/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/PlainTextLogicTest.java +++ b/code/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/PlainTextLogicTest.java @@ -1,7 +1,7 @@ package nu.marginalia.converting.logic; import nu.marginalia.converting.processor.logic.PlainTextLogic; -import nu.marginalia.util.LineUtils; +import nu.marginalia.converting.util.LineUtils; import nu.marginalia.model.EdgeUrl; import org.junit.jupiter.api.Test; diff --git a/code/crawl/converting-process/src/test/java/nu/marginalia/converting/processor/keywords/SentenceExtractorTest.java b/code/crawl/converting-process/src/test/java/nu/marginalia/converting/processor/keywords/SentenceExtractorTest.java index 512febde..bc348377 100644 --- a/code/crawl/converting-process/src/test/java/nu/marginalia/converting/processor/keywords/SentenceExtractorTest.java +++ b/code/crawl/converting-process/src/test/java/nu/marginalia/converting/processor/keywords/SentenceExtractorTest.java @@ -3,7 +3,6 @@ package nu.marginalia.converting.processor.keywords; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; import nu.marginalia.language.WordPatterns; -import nu.marginalia.language.model.KeywordMetadata; import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordSpan; import nu.marginalia.language.sentence.SentenceExtractor; @@ -12,10 +11,7 @@ import nu.marginalia.language.keywords.KeywordExtractor; import nu.marginalia.language.model.WordSeparator; import 
nu.marginalia.WmsaHome; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawl.EdgePageWordFlags; -import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.test.util.TestLanguageModels; -import org.apache.commons.lang3.tuple.Pair; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; @@ -27,7 +23,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.*; import java.util.regex.Pattern; -import java.util.stream.IntStream; @Tag("slow") class SentenceExtractorTest { diff --git a/code/common/model/src/test/java/nu/marginalia/util/LineUtilsTest.java b/code/crawl/converting-process/src/test/java/nu/marginalia/converting/util/LineUtilsTest.java similarity index 90% rename from code/common/model/src/test/java/nu/marginalia/util/LineUtilsTest.java rename to code/crawl/converting-process/src/test/java/nu/marginalia/converting/util/LineUtilsTest.java index e63ca38f..a5443780 100644 --- a/code/common/model/src/test/java/nu/marginalia/util/LineUtilsTest.java +++ b/code/crawl/converting-process/src/test/java/nu/marginalia/converting/util/LineUtilsTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.util; +package nu.marginalia.converting.util; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; diff --git a/code/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java b/code/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java index f0a2fda2..5dcf0056 100644 --- a/code/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java +++ b/code/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java @@ -5,9 +5,8 @@ import com.google.common.hash.Hashing; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.crawling.model.CrawlingSpecification; import nu.marginalia.model.EdgeDomain; -import 
nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.model.dbcommon.DomainBlacklistImpl; -import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; @@ -69,11 +68,11 @@ public class CrawlJobDomainExtractor { """; - private final EdgeDomainBlacklistImpl blacklist; + private final DomainBlacklistImpl blacklist; private final HikariDataSource dataSource; private static final HashFunction hasher = Hashing.murmur3_128(0); - public CrawlJobDomainExtractor(EdgeDomainBlacklistImpl blacklist, HikariDataSource dataSource) { + public CrawlJobDomainExtractor(DomainBlacklistImpl blacklist, HikariDataSource dataSource) { this.blacklist = blacklist; this.dataSource = dataSource; } diff --git a/code/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java b/code/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java index 0f53b2dc..faa4e472 100644 --- a/code/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java +++ b/code/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java @@ -2,7 +2,7 @@ package nu.marginalia.crawl; import nu.marginalia.crawling.model.CrawlingSpecification; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.model.dbcommon.DomainBlacklistImpl; import nu.marginalia.service.module.DatabaseModule; import java.io.IOException; @@ -37,7 +37,7 @@ public class CrawlJobExtractorMain { private static Stream streamSpecs(String[] targetDomains) { var ds = new DatabaseModule().provideConnection(); - var domainExtractor = new CrawlJobDomainExtractor(new EdgeDomainBlacklistImpl(ds), ds); + var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(ds), ds); if (targetDomains.length > 0) { return 
Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractDomain); diff --git a/code/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/ContentType.java b/code/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/ContentType.java new file mode 100644 index 00000000..e8a9fca1 --- /dev/null +++ b/code/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/ContentType.java @@ -0,0 +1,5 @@ +package nu.marginalia.crawling.model; + + +public record ContentType(String contentType, String charset) { +} diff --git a/code/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/HttpFetcher.java b/code/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/HttpFetcher.java index 141d7970..642322b0 100644 --- a/code/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/HttpFetcher.java +++ b/code/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/HttpFetcher.java @@ -9,7 +9,7 @@ import lombok.SneakyThrows; import lombok.ToString; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.model.crawl.EdgeContentType; +import nu.marginalia.crawling.model.ContentType; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.bigstring.BigString; @@ -257,11 +257,11 @@ public class HttpFetcher { byte[] data = byteStream.readNBytes(maxFetchSize); var contentType = ContentTypeParser.parse(contentTypeHeader, data); - if (!contentTypeLogic.isAllowableContentType(contentType.contentType)) { + if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) { return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); } - if ("Shift_JIS".equalsIgnoreCase(contentType.charset)) { + if ("Shift_JIS".equalsIgnoreCase(contentType.charset())) { return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, ""); } @@ -280,10 +280,10 @@ public 
class HttpFetcher { .build(); } - private String getStringData(byte[] data, EdgeContentType contentType) { + private String getStringData(byte[] data, ContentType contentType) { Charset charset; try { - charset = Charset.forName(contentType.charset); + charset = Charset.forName(contentType.charset()); } catch (IllegalCharsetNameException ex) { charset = StandardCharsets.UTF_8; diff --git a/code/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java b/code/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java index 62d21ba9..604264e3 100644 --- a/code/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java +++ b/code/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java @@ -1,7 +1,7 @@ package nu.marginalia.crawl.retreival.logic; import crawlercommons.mimetypes.MimeTypeDetector; -import nu.marginalia.model.crawl.EdgeContentType; +import nu.marginalia.crawling.model.ContentType; import org.jsoup.Jsoup; import java.util.Arrays; @@ -11,25 +11,25 @@ public class ContentTypeParser { static final MimeTypeDetector mimeTypeDetector = new MimeTypeDetector(); - public static EdgeContentType parse(String contentType, byte[] data) { + public static ContentType parse(String contentType, byte[] data) { return getContentTypeFromContentTypeString(contentType) .or(() -> getContentTypeStringFromTag(data)) .orElseGet(() -> { Optional charset = getCharsetFromTag(data); - return new EdgeContentType( + return new ContentType( Optional.ofNullable(contentType) .or(() -> Optional.ofNullable(mimeTypeDetector.detect(data))) .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(data)), charset.orElse("ISO_8859_1")); }); } - private static Optional getContentTypeFromContentTypeString(String contentType) { + private static Optional getContentTypeFromContentTypeString(String contentType) { if (contentType != null && 
contentType.contains(";")) { var parts = contentType.split(";"); var content = parts[0].trim(); var extra = parts[1].trim(); if (extra.startsWith("charset=")) { - return Optional.of(new EdgeContentType(content, extra.substring("charset=".length()))); + return Optional.of(new ContentType(content, extra.substring("charset=".length()))); } } return Optional.empty(); @@ -53,7 +53,7 @@ public class ContentTypeParser { } - private static Optional getContentTypeStringFromTag(byte[] data) { + private static Optional getContentTypeStringFromTag(byte[] data) { String header = new String(Arrays.copyOf(data, Math.min(1024, data.length))); var doc = Jsoup.parse(header); for (var metaTag : doc.getElementsByTag("meta")) { diff --git a/code/crawl/experimental/build.gradle b/code/crawl/experimental/build.gradle index 52939ce0..cfa239b2 100644 --- a/code/crawl/experimental/build.gradle +++ b/code/crawl/experimental/build.gradle @@ -24,6 +24,8 @@ dependencies { implementation project(':code:crawl:common') implementation project(':code:crawl:crawling-model') implementation project(':code:crawl:converting-process') + implementation project(':code:features:adblock') + implementation project(':code:features:topic-detection') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/crawl/experimental/src/main/java/nu/marginalia/experimental/AdblockTesterTool.java b/code/crawl/experimental/src/main/java/nu/marginalia/experimental/AdblockTesterTool.java index 61f91e52..e34d58f0 100644 --- a/code/crawl/experimental/src/main/java/nu/marginalia/experimental/AdblockTesterTool.java +++ b/code/crawl/experimental/src/main/java/nu/marginalia/experimental/AdblockTesterTool.java @@ -1,7 +1,7 @@ package nu.marginalia.experimental; +import nu.marginalia.adblock.AdblockSimulator; import nu.marginalia.converting.processor.DocumentProcessor; -import nu.marginalia.converting.processor.logic.topic.AdblockSimulator; import nu.marginalia.crawling.common.plan.CrawlPlanLoader; import 
nu.marginalia.crawling.common.plan.EdgeCrawlPlan; import nu.marginalia.crawling.model.CrawledDocument; diff --git a/code/crawl/experimental/src/main/java/nu/marginalia/experimental/ConverterLogicTestTool.java b/code/crawl/experimental/src/main/java/nu/marginalia/experimental/ConverterLogicTestTool.java index f9d15b81..9300a7d6 100644 --- a/code/crawl/experimental/src/main/java/nu/marginalia/experimental/ConverterLogicTestTool.java +++ b/code/crawl/experimental/src/main/java/nu/marginalia/experimental/ConverterLogicTestTool.java @@ -10,10 +10,10 @@ import nu.marginalia.converting.processor.logic.DomPruningFilter; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.WmsaHome; import nu.marginalia.converting.processor.DomainProcessor; -import nu.marginalia.converting.processor.logic.topic.GoogleAnwersSpamDetector; -import nu.marginalia.converting.processor.logic.topic.RecipeDetector; -import nu.marginalia.converting.processor.logic.topic.TextileCraftDetector; -import nu.marginalia.converting.processor.logic.topic.WoodworkingDetector; +import nu.marginalia.adblock.GoogleAnwersSpamDetector; +import nu.marginalia.topic.RecipeDetector; +import nu.marginalia.topic.TextileCraftDetector; +import nu.marginalia.topic.WoodworkingDetector; import org.jsoup.Jsoup; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/code/crawl/experimental/src/main/java/nu/marginalia/experimental/CrawlDataExtractorTool.java b/code/crawl/experimental/src/main/java/nu/marginalia/experimental/CrawlDataExtractorTool.java index a4177562..c1140986 100644 --- a/code/crawl/experimental/src/main/java/nu/marginalia/experimental/CrawlDataExtractorTool.java +++ b/code/crawl/experimental/src/main/java/nu/marginalia/experimental/CrawlDataExtractorTool.java @@ -1,8 +1,8 @@ package nu.marginalia.experimental; import lombok.SneakyThrows; +import nu.marginalia.adblock.AdblockSimulator; import nu.marginalia.converting.processor.DocumentProcessor; -import 
nu.marginalia.converting.processor.logic.topic.AdblockSimulator; import nu.marginalia.crawling.common.plan.CrawlPlanLoader; import nu.marginalia.crawling.common.plan.EdgeCrawlPlan; import nu.marginalia.crawling.model.CrawledDocument; diff --git a/code/crawl/loading-process/build.gradle b/code/crawl/loading-process/build.gradle index 6771b5d3..bf93444a 100644 --- a/code/crawl/loading-process/build.gradle +++ b/code/crawl/loading-process/build.gradle @@ -28,7 +28,6 @@ dependencies { implementation project(':code:index:lexicon') implementation project(':code:index:index-journal') implementation project(':code:libraries:language-processing') - implementation project(':code:libraries:misc') testImplementation project(':code:services-core:search-service') diff --git a/code/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java b/code/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java index 08940e49..ebe20ca9 100644 --- a/code/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java +++ b/code/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java @@ -2,7 +2,7 @@ package nu.marginalia.loading.loader; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.converting.instruction.Interpreter; import nu.marginalia.converting.model.DocumentKeywords; @@ -76,7 +76,7 @@ public class Loader implements Interpreter { } @Override - public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) { + public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) { sqlLoadProcessedDomain.load(data, domain, state, ip); } diff --git a/code/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java 
b/code/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java index 3b2304a9..dd1f5e4c 100644 --- a/code/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java +++ b/code/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java @@ -2,7 +2,7 @@ package nu.marginalia.loading.loader; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.converting.instruction.instructions.DomainLink; import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; @@ -42,7 +42,7 @@ public class SqlLoadProcessedDomain { } } - public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, String ip) { + public void load(LoaderData data, EdgeDomain domain, DomainIndexingState state, String ip) { data.setTargetDomain(domain); loadDomains.load(data, domain); diff --git a/code/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java b/code/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java index 51752127..1b3afa82 100644 --- a/code/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java +++ b/code/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java @@ -9,8 +9,8 @@ import nu.marginalia.loading.loader.SqlLoadProcessedDocument; import nu.marginalia.loading.loader.SqlLoadUrls; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawl.EdgeHtmlStandard; -import nu.marginalia.model.crawl.EdgeUrlState; +import nu.marginalia.crawling.common.model.HtmlStandard; +import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.id.EdgeIdArray; import 
org.junit.jupiter.api.*; @@ -69,11 +69,11 @@ class SqlLoadProcessedDocumentTest { loader.load(loaderData, List.of(new LoadProcessedDocument( url, - EdgeUrlState.OK, + UrlIndexingState.OK, "TITLE", "DESCR", HtmlFeature.encode(Set.of(HtmlFeature.AFFILIATE_LINK)), - EdgeHtmlStandard.HTML5, + HtmlStandard.HTML5, 100, 12345, -3.14, diff --git a/code/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java b/code/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java index 82f38c23..e4051790 100644 --- a/code/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java +++ b/code/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java @@ -6,7 +6,7 @@ import nu.marginalia.loading.loader.SqlLoadDomains; import nu.marginalia.loading.loader.SqlLoadProcessedDomain; import nu.marginalia.converting.instruction.instructions.DomainLink; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.DomainIndexingState; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Tag; @@ -48,7 +48,7 @@ class SqlLoadProcessedDomainTest { @Test public void loadProcessedDomain() { var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); - loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), EdgeDomainIndexingState.BLOCKED, "127.0.0.1"); + loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1"); } @Test public void loadDomainAlias() { diff --git a/code/features/adblock/build.gradle b/code/features/adblock/build.gradle new file mode 100644 index 00000000..06a13a9a --- /dev/null +++ b/code/features/adblock/build.gradle @@ -0,0 +1,41 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id "de.undercouch.download" version "5.1.0" + + id 
'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':code:common:config') + + implementation libs.lombok + annotationProcessor libs.lombok + + implementation libs.bundles.slf4j + implementation libs.guice + implementation libs.notnull + implementation libs.jsoup + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/code/features/adblock/readme.md b/code/features/adblock/readme.md new file mode 100644 index 00000000..1df54936 --- /dev/null +++ b/code/features/adblock/readme.md @@ -0,0 +1,8 @@ +# Adblock + +Contains an adblock simulator that reads an adblock specifications file and +uses it to identify if a document has ads. + +## Central Classes + +* [AdblockSimulator](src/main/java/nu/marginalia/adblock/AdblockSimulator.java) \ No newline at end of file diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/AdblockSimulator.java b/code/features/adblock/src/main/java/nu/marginalia/adblock/AdblockSimulator.java similarity index 98% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/AdblockSimulator.java rename to code/features/adblock/src/main/java/nu/marginalia/adblock/AdblockSimulator.java index 62c4b778..2fabcfa3 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/AdblockSimulator.java +++ b/code/features/adblock/src/main/java/nu/marginalia/adblock/AdblockSimulator.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.processor.logic.topic; +package nu.marginalia.adblock; import com.google.inject.Inject; import com.google.inject.Singleton; diff --git 
a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/GoogleAnwersSpamDetector.java b/code/features/adblock/src/main/java/nu/marginalia/adblock/GoogleAnwersSpamDetector.java similarity index 93% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/GoogleAnwersSpamDetector.java rename to code/features/adblock/src/main/java/nu/marginalia/adblock/GoogleAnwersSpamDetector.java index dc0c4eed..4cec3700 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/GoogleAnwersSpamDetector.java +++ b/code/features/adblock/src/main/java/nu/marginalia/adblock/GoogleAnwersSpamDetector.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.processor.logic.topic; +package nu.marginalia.adblock; import org.jsoup.nodes.Document; diff --git a/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainData.java b/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainData.java index 6d13fd09..b4fa8abd 100644 --- a/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainData.java +++ b/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainData.java @@ -2,7 +2,7 @@ package nu.marginalia.ranking.data; import lombok.AllArgsConstructor; import lombok.Data; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.DomainIndexingState; @Data @AllArgsConstructor @@ -10,7 +10,7 @@ public class RankingDomainData { public final int id; public final String name; private int alias; - public EdgeDomainIndexingState state; + public DomainIndexingState state; public final int knownUrls; public int resolveAlias() { @@ -23,10 +23,10 @@ public class RankingDomainData { } public boolean isSpecial() { - return EdgeDomainIndexingState.SPECIAL == state; + return DomainIndexingState.SPECIAL == state; } public boolean isSocialMedia() { 
- return EdgeDomainIndexingState.SOCIAL_MEDIA == state; + return DomainIndexingState.SOCIAL_MEDIA == state; } } diff --git a/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java b/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java index a330ede7..7f577f3b 100644 --- a/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java +++ b/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java @@ -3,8 +3,8 @@ package nu.marginalia.ranking.data; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.dbcommon.DomainBlacklistImpl; +import nu.marginalia.model.crawl.DomainIndexingState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -15,13 +15,13 @@ import java.util.function.IntConsumer; @Singleton public class RankingDomainFetcher { protected final HikariDataSource dataSource; - protected final EdgeDomainBlacklistImpl blacklist; + protected final DomainBlacklistImpl blacklist; protected final Logger logger = LoggerFactory.getLogger(getClass()); protected boolean getNames = false; @Inject - public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) { + public RankingDomainFetcher(HikariDataSource dataSource, DomainBlacklistImpl blacklist) { this.dataSource = dataSource; this.blacklist = blacklist; } @@ -66,7 +66,7 @@ public class RankingDomainFetcher { new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), - EdgeDomainIndexingState.valueOf(rsp.getString(4)), + DomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5))); } } diff --git a/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java 
b/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java index 738ecb55..0bfff828 100644 --- a/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java +++ b/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java @@ -3,7 +3,7 @@ package nu.marginalia.ranking.data; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.model.dbcommon.DomainBlacklistImpl; import org.slf4j.LoggerFactory; import java.sql.SQLException; @@ -14,7 +14,7 @@ public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher final boolean hasData; @Inject - public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) { + public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, DomainBlacklistImpl blacklist) { super(dataSource, blacklist); hasData = isDomainNeighborTablePopulated(dataSource); diff --git a/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/CreateBrowseDomainRanksTool.java b/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/CreateBrowseDomainRanksTool.java index 4ff472cc..058cf32b 100644 --- a/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/CreateBrowseDomainRanksTool.java +++ b/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/CreateBrowseDomainRanksTool.java @@ -2,7 +2,7 @@ package nu.marginalia.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.model.dbcommon.DomainBlacklistImpl; import nu.marginalia.ranking.StandardPageRank; import nu.marginalia.ranking.accumulator.RankingResultListAccumulator; 
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; @@ -32,7 +32,7 @@ public class CreateBrowseDomainRanksTool { logger.info("Ranking"); var ds = new DatabaseModule().provideConnection(); - var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds)); + var domains = new RankingDomainFetcherForSimilarityData(ds, new DomainBlacklistImpl(ds)); var rpr = new StandardPageRank(domains, args); uploader.start(); diff --git a/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PerusePageRankV2.java b/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PerusePageRankV2.java index 0e615552..2a3c15ae 100644 --- a/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PerusePageRankV2.java +++ b/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PerusePageRankV2.java @@ -13,7 +13,7 @@ import lombok.SneakyThrows; import nu.marginalia.ranking.RankingAlgorithm; import nu.marginalia.ranking.data.RankingDomainData; import nu.marginalia.ranking.data.RankingDomainFetcher; -import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.model.dbcommon.DomainBlacklistImpl; import nu.marginalia.service.module.DatabaseModule; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; @@ -49,7 +49,7 @@ public class PerusePageRankV2 { @SneakyThrows public static void main(String... 
args) { var ds = new DatabaseModule().provideConnection(); - var blacklist = new EdgeDomainBlacklistImpl(ds); + var blacklist = new DomainBlacklistImpl(ds); var rank = new PerusePageRankV2(new RankingDomainFetcher(ds, blacklist)); long start = System.currentTimeMillis(); diff --git a/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PrintDomainRanksTool.java b/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PrintDomainRanksTool.java index d608abad..11d71ddf 100644 --- a/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PrintDomainRanksTool.java +++ b/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PrintDomainRanksTool.java @@ -3,7 +3,7 @@ package nu.marginalia.ranking.tool; import lombok.SneakyThrows; import nu.marginalia.ranking.accumulator.RankingResultListAccumulator; import nu.marginalia.ranking.data.RankingDomainFetcher; -import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.model.dbcommon.DomainBlacklistImpl; import nu.marginalia.ranking.StandardPageRank; import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; import nu.marginalia.service.module.DatabaseModule; @@ -35,11 +35,11 @@ public class PrintDomainRanksTool { RankingDomainFetcher domains; if (Boolean.getBoolean("use-link-data")) { - domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + domains = new RankingDomainFetcher(ds, new DomainBlacklistImpl(ds)); domains.retainNames(); } else { - domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds)); + domains = new RankingDomainFetcherForSimilarityData(ds, new DomainBlacklistImpl(ds)); domains.retainNames(); } diff --git a/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/UpdateDomainRanksTool.java b/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/UpdateDomainRanksTool.java index 804df19e..abd00f89 100644 --- 
a/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/UpdateDomainRanksTool.java +++ b/code/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/UpdateDomainRanksTool.java @@ -6,7 +6,7 @@ import nu.marginalia.ranking.StandardPageRank; import nu.marginalia.ranking.accumulator.RankingResultListAccumulator; import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; -import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.model.dbcommon.DomainBlacklistImpl; import nu.marginalia.service.module.DatabaseModule; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; @@ -33,7 +33,7 @@ public class UpdateDomainRanksTool { var uploader = new Thread(() -> uploadThread(conn), "Uploader"); logger.info("Ranking"); - var domains = new RankingDomainFetcherForSimilarityData(conn, new EdgeDomainBlacklistImpl(conn)); + var domains = new RankingDomainFetcherForSimilarityData(conn, new DomainBlacklistImpl(conn)); var rpr = new StandardPageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com"); rankMax = rpr.size(); diff --git a/code/features/pubdate/build.gradle b/code/features/pubdate/build.gradle new file mode 100644 index 00000000..caac0252 --- /dev/null +++ b/code/features/pubdate/build.gradle @@ -0,0 +1,44 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id "de.undercouch.download" version "5.1.0" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':code:common:model') + implementation project(':code:crawl:common') + + implementation libs.lombok + annotationProcessor libs.lombok + + implementation libs.bundles.slf4j + implementation libs.guice + implementation libs.notnull + implementation libs.gson + implementation libs.jsoup + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + 
testImplementation project(':code:common:config') +} + + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/code/features/pubdate/readme.md b/code/features/pubdate/readme.md new file mode 100644 index 00000000..40f28710 --- /dev/null +++ b/code/features/pubdate/readme.md @@ -0,0 +1,7 @@ +# Pubdate + +Contains advanced haruspicy for figuring out when a document was published. + +## Central Classes + +* [PubDateSniffer](src/main/java/nu/marginalia/pubdate/PubDateSniffer.java) \ No newline at end of file diff --git a/code/features/pubdate/src/main/java/nu/marginalia/pubdate/PubDateEffortLevel.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/PubDateEffortLevel.java new file mode 100644 index 00000000..e2fd4e65 --- /dev/null +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/PubDateEffortLevel.java @@ -0,0 +1,6 @@ +package nu.marginalia.pubdate; + +public enum PubDateEffortLevel { + LOW, + HIGH +} diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateHeuristic.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/PubDateHeuristic.java similarity index 56% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateHeuristic.java rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/PubDateHeuristic.java index e2e67258..4f29567a 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateHeuristic.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/PubDateHeuristic.java @@ -1,7 +1,7 @@ -package nu.marginalia.converting.processor.logic.pubdate; +package nu.marginalia.pubdate; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import 
org.jsoup.nodes.Document; @@ -9,5 +9,5 @@ import java.util.Optional; public interface PubDateHeuristic { - Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard); + Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard); } diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateParser.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java similarity index 97% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateParser.java rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java index 131a5f3d..77f2808a 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateParser.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java @@ -1,6 +1,6 @@ -package nu.marginalia.converting.processor.logic.pubdate; +package nu.marginalia.pubdate; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import java.time.DateTimeException; @@ -122,7 +122,7 @@ public class PubDateParser { return (max + min) / 2; } - public static int guessYear(EdgeHtmlStandard standard) { + public static int guessYear(HtmlStandard standard) { // Create some jitter to avoid having documents piling up in the same four years // as this would make searching in those years disproportionately useless diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateSniffer.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java similarity index 87% rename from 
code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateSniffer.java rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java index 7eeca0d3..07d12b20 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateSniffer.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java @@ -1,9 +1,9 @@ -package nu.marginalia.converting.processor.logic.pubdate; +package nu.marginalia.pubdate; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.heuristic.*; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.pubdate.heuristic.*; import org.jsoup.nodes.Document; import java.util.ArrayList; @@ -36,7 +36,7 @@ public class PubDateSniffer { heuristics.add(new PubDateHeuristicGuessFromHtmlStandard()); } - public PubDate getPubDate(String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard, boolean runExpensive) { + public PubDate getPubDate(String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) { final PubDateEffortLevel effortLevel = runExpensive ? 
PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW; for (var heuristic : heuristics) { diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java similarity index 87% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java index 8d32d965..3c18a65c 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java @@ -1,11 +1,11 @@ -package nu.marginalia.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.pubdate.heuristic; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.pubdate.PubDateHeuristic; +import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.pubdate.PubDateEffortLevel; import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -18,7 +18,7 @@ import java.util.Optional; public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + public Optional 
apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { if (effortLevel == PubDateEffortLevel.LOW) return Optional.empty(); @@ -32,9 +32,9 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic { private static class DateExtractingNodeVisitorPass implements NodeFilter { public PubDate pubDate; - private final EdgeHtmlStandard htmlStandard; + private final HtmlStandard htmlStandard; - private DateExtractingNodeVisitorPass(EdgeHtmlStandard htmlStandard) { + private DateExtractingNodeVisitorPass(HtmlStandard htmlStandard) { this.htmlStandard = htmlStandard; } @@ -130,7 +130,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic { } private void parse(String text) { - if (htmlStandard == EdgeHtmlStandard.UNKNOWN) { + if (htmlStandard == HtmlStandard.UNKNOWN) { PubDateParser .dateFromHighestYearLookingSubstring(text) .ifPresent(this::setPubDate); diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java similarity index 83% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java index 4692153d..52d083dc 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java @@ -1,10 +1,10 @@ -package nu.marginalia.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.pubdate.heuristic; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import 
nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.pubdate.PubDateEffortLevel; +import nu.marginalia.pubdate.PubDateHeuristic; +import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; @@ -17,7 +17,7 @@ import java.util.Optional; public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { if (effortLevel == PubDateEffortLevel.LOW) return Optional.empty(); @@ -31,9 +31,9 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic { private static class DateExtractingNodeVisitor implements NodeFilter { public PubDate pubDate; - private final EdgeHtmlStandard htmlStandard; + private final HtmlStandard htmlStandard; - private DateExtractingNodeVisitor(EdgeHtmlStandard htmlStandard) { + private DateExtractingNodeVisitor(HtmlStandard htmlStandard) { this.htmlStandard = htmlStandard; } @@ -71,7 +71,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic { } private void parse(String text) { - if (htmlStandard == EdgeHtmlStandard.UNKNOWN) { + if (htmlStandard == HtmlStandard.UNKNOWN) { PubDateParser .dateFromHighestYearLookingSubstring(text) .ifPresent(this::setPubDate); diff --git a/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java 
b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java new file mode 100644 index 00000000..4a13c278 --- /dev/null +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java @@ -0,0 +1,23 @@ +package nu.marginalia.pubdate.heuristic; + +import nu.marginalia.crawling.common.model.HtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.pubdate.PubDateEffortLevel; +import nu.marginalia.pubdate.PubDateHeuristic; +import nu.marginalia.pubdate.PubDateParser; +import nu.marginalia.model.EdgeUrl; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { + if (htmlStandard == HtmlStandard.UNKNOWN) + return Optional.empty(); + + return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard))); + } + +} diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java similarity index 64% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java index ca0220ae..4a79007c 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java @@ -1,11 +1,11 @@ -package 
nu.marginalia.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.pubdate.heuristic; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.pubdate.PubDateHeuristic; +import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; @@ -13,7 +13,7 @@ import java.util.Optional; public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { // HTML5, alternative approach for (var tag : document.select("time")) { var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime")); diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java similarity index 59% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java index c63e15b1..2e36f255 100644 --- 
a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java @@ -1,11 +1,11 @@ -package nu.marginalia.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.pubdate.heuristic; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.pubdate.PubDateHeuristic; +import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; @@ -13,7 +13,7 @@ import java.util.Optional; public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { // HTML5 for (var tag : document.select("time[pubdate=\"pubdate\"]")) { var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime")); diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java similarity index 58% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java 
rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java index 5b2b7034..8c6bbdef 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java @@ -1,11 +1,11 @@ -package nu.marginalia.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.pubdate.heuristic; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.pubdate.PubDateHeuristic; +import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; @@ -13,7 +13,7 @@ import java.util.Optional; public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { for (var tag : document.select("time[itemprop=\"datePublished\"]")) { var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); if (maybeDate.isPresent()) { diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java similarity index 73% 
rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java index aedb0611..ebab7589 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java @@ -1,14 +1,14 @@ -package nu.marginalia.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.pubdate.heuristic; import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.google.gson.JsonSyntaxException; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.pubdate.PubDateHeuristic; +import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; @@ -16,7 +16,7 @@ import java.util.Optional; public class PubDateHeuristicJSONLD implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { for (var tag : document.select("script[type=\"application/ld+json\"]")) { var maybeDate = parseLdJson(tag.data()) .flatMap(PubDateParser::attemptParseDate); diff --git 
a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicLastModified.java similarity index 62% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicLastModified.java index f7ed3af9..d987c577 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicLastModified.java @@ -1,11 +1,11 @@ -package nu.marginalia.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.pubdate.heuristic; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.pubdate.PubDateHeuristic; +import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; @@ -13,7 +13,7 @@ import java.util.Optional; public class PubDateHeuristicLastModified implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { String lmString = "last-modified: "; int offset = 
headers.toLowerCase().indexOf(lmString); diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicMicrodata.java similarity index 58% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicMicrodata.java index 75de4a71..5d4fbec3 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicMicrodata.java @@ -1,11 +1,11 @@ -package nu.marginalia.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.pubdate.heuristic; -import nu.marginalia.model.crawl.EdgeHtmlStandard; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.pubdate.PubDateHeuristic; +import nu.marginalia.pubdate.PubDateParser; +import nu.marginalia.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; @@ -13,7 +13,7 @@ import java.util.Optional; public class PubDateHeuristicMicrodata implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, 
HtmlStandard htmlStandard) { for (var tag : document.select("meta[itemprop=\"datePublished\"]")) { var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicOpenGraph.java similarity index 59% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicOpenGraph.java index 6ddd78d8..256d844e 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicOpenGraph.java @@ -1,10 +1,10 @@ -package nu.marginalia.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.pubdate.heuristic; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.pubdate.PubDateEffortLevel; +import nu.marginalia.pubdate.PubDateHeuristic; +import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; import org.jsoup.nodes.Document; @@ -13,7 +13,7 @@ import java.util.Optional; public class PubDateHeuristicOpenGraph implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel 
effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { // OG for (var tag : document.select("meta[property=\"article:published_time\"]")) { var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicRDFaTag.java similarity index 58% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicRDFaTag.java index 59f8e08d..561ef37e 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicRDFaTag.java @@ -1,10 +1,10 @@ -package nu.marginalia.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.pubdate.heuristic; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.pubdate.PubDateEffortLevel; +import nu.marginalia.pubdate.PubDateHeuristic; +import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; import org.jsoup.nodes.Document; @@ -13,7 +13,7 @@ import java.util.Optional; public class PubDateHeuristicRDFaTag implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard 
htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { for (var tag : document.select("meta[property=\"datePublished\"]")) { var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); if (maybeDate.isPresent()) { diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java similarity index 74% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java index 2756c089..ab5515b3 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java @@ -1,11 +1,11 @@ -package nu.marginalia.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.pubdate.heuristic; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.pubdate.PubDateHeuristic; +import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; @@ -20,7 +20,7 @@ public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic { private 
static final int MIN_URL_PATTERN_YEAR = 2000; @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { final String urlString = url.path; var matcher = yearUrlPattern.matcher(urlString); diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java similarity index 71% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java rename to code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java index 6432d9c3..c8627eca 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java +++ b/code/features/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java @@ -1,11 +1,11 @@ -package nu.marginalia.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.pubdate.heuristic; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.pubdate.PubDateHeuristic; +import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; @@ -17,7 
+17,7 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic { private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/"); @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { final String urlString = url.path; var matcher = yearUrlPattern.matcher(urlString); diff --git a/code/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/PubDateSnifferTest.java b/code/features/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java similarity index 90% rename from code/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/PubDateSnifferTest.java rename to code/features/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java index 53095944..d6c27528 100644 --- a/code/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/PubDateSnifferTest.java +++ b/code/features/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java @@ -1,11 +1,9 @@ -package nu.marginalia.converting.logic; +package nu.marginalia.pubdate; import nu.marginalia.WmsaHome; -import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.converting.processor.logic.pubdate.PubDateSniffer; -import nu.marginalia.converting.processor.logic.pubdate.heuristic.PubDateHeuristicDOMParsingPass2; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.crawling.common.model.HtmlStandard; +import nu.marginalia.pubdate.heuristic.PubDateHeuristicDOMParsingPass2; import org.jsoup.Jsoup; import org.junit.jupiter.api.Test; @@ -75,7 +73,7 @@ class PubDateSnifferTest { Wow, sure lor 'em boss - """), EdgeHtmlStandard.UNKNOWN, true); + """), HtmlStandard.UNKNOWN, true); assertFalse(ret.isEmpty()); 
assertEquals("2022-08-24", ret.dateIso8601()); @@ -91,7 +89,7 @@ class PubDateSnifferTest { Wow, sure lor 'em boss - """), EdgeHtmlStandard.UNKNOWN, true); + """), HtmlStandard.UNKNOWN, true); assertFalse(ret.isEmpty()); assertEquals("2022-08-24", ret.dateIso8601()); @@ -107,7 +105,7 @@ class PubDateSnifferTest { Wow, sure lor 'em boss - """), EdgeHtmlStandard.UNKNOWN, true); + """), HtmlStandard.UNKNOWN, true); assertFalse(ret.isEmpty()); assertEquals(2006, ret.year()); @@ -117,14 +115,14 @@ class PubDateSnifferTest { public void testProblemCases() throws IOException, URISyntaxException { var ret = dateSniffer.getPubDate("", new EdgeUrl("https://www.example.com/"), - Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), EdgeHtmlStandard.HTML5, true); + Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), HtmlStandard.HTML5, true); assertFalse(ret.isEmpty()); assertEquals(2006, ret.year()); ret = dateSniffer.getPubDate("", new EdgeUrl("https://www.example.com/"), - Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), EdgeHtmlStandard.XHTML, true); + Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), HtmlStandard.XHTML, true); assertFalse(ret.isEmpty()); assertEquals(2010, ret.year()); @@ -147,7 +145,7 @@ class PubDateSnifferTest { - """), EdgeHtmlStandard.UNKNOWN, true); + """), HtmlStandard.UNKNOWN, true); assertFalse(ret.isEmpty()); assertEquals("2022-08-24", ret.dateIso8601()); @@ -161,7 +159,7 @@ class PubDateSnifferTest { - """),EdgeHtmlStandard.UNKNOWN, true); + """), HtmlStandard.UNKNOWN, true); assertFalse(ret.isEmpty()); assertEquals("2022-08-24", ret.dateIso8601()); @@ -175,7 +173,7 @@ class PubDateSnifferTest { - 
"""), EdgeHtmlStandard.UNKNOWN, true); + """), HtmlStandard.UNKNOWN, true); assertFalse(ret.isEmpty()); assertEquals("2004-08-24", ret.dateIso8601()); @@ -189,7 +187,7 @@ class PubDateSnifferTest { No date in the HTML - """), EdgeHtmlStandard.UNKNOWN, true); + """), HtmlStandard.UNKNOWN, true); assertFalse(ret.isEmpty()); assertNull(ret.dateIso8601()); @@ -204,7 +202,7 @@ class PubDateSnifferTest { No date in the HTML - """), EdgeHtmlStandard.UNKNOWN, true); + """), HtmlStandard.UNKNOWN, true); assertFalse(ret.isEmpty()); assertEquals("2022-02-03", ret.dateIso8601()); @@ -219,7 +217,7 @@ class PubDateSnifferTest {

Published 2003, updated 2022

- """), EdgeHtmlStandard.HTML5, true); + """), HtmlStandard.HTML5, true); assertFalse(ret.isEmpty()); assertNull(ret.dateIso8601()); @@ -245,7 +243,7 @@ class PubDateSnifferTest {
 Post subject: Keyboards.
Post #1 Posted: Sun Oct 03, 2010 5:37 pm 
- """), EdgeHtmlStandard.UNKNOWN, true); + """), HtmlStandard.UNKNOWN, true); assertFalse(ret.isEmpty()); assertNull(ret.dateIso8601()); diff --git a/code/features/query-parser/build.gradle b/code/features/query-parser/build.gradle index ee29e5aa..25dfb196 100644 --- a/code/features/query-parser/build.gradle +++ b/code/features/query-parser/build.gradle @@ -13,7 +13,6 @@ java { } dependencies { implementation project(':code:libraries:language-processing') - implementation project(':code:libraries:misc') implementation project(':code:common:config') implementation project(':code:common:model') diff --git a/code/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryParser.java b/code/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryParser.java index eebf2daa..5b01fc7b 100644 --- a/code/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryParser.java +++ b/code/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryParser.java @@ -3,7 +3,7 @@ package nu.marginalia.query_parser; import nu.marginalia.language.WordPatterns; import nu.marginalia.query_parser.token.Token; import nu.marginalia.query_parser.token.TokenType; -import nu.marginalia.util.TransformList; +import nu.marginalia.transform_list.TransformList; import java.util.List; diff --git a/code/libraries/misc/src/main/java/nu/marginalia/util/TransformList.java b/code/features/query-parser/src/main/java/nu/marginalia/transform_list/TransformList.java similarity index 99% rename from code/libraries/misc/src/main/java/nu/marginalia/util/TransformList.java rename to code/features/query-parser/src/main/java/nu/marginalia/transform_list/TransformList.java index 352b39cb..20b2b5ea 100644 --- a/code/libraries/misc/src/main/java/nu/marginalia/util/TransformList.java +++ b/code/features/query-parser/src/main/java/nu/marginalia/transform_list/TransformList.java @@ -1,4 +1,4 @@ -package nu.marginalia.util; +package nu.marginalia.transform_list; import 
java.util.List; import java.util.function.BiConsumer; diff --git a/code/libraries/misc/src/test/java/nu/marginalia/util/TransformListTest.java b/code/features/query-parser/src/test/java/nu/marginalia/transform_list/TransformListTest.java similarity index 98% rename from code/libraries/misc/src/test/java/nu/marginalia/util/TransformListTest.java rename to code/features/query-parser/src/test/java/nu/marginalia/transform_list/TransformListTest.java index 2a9ea325..d34a86ea 100644 --- a/code/libraries/misc/src/test/java/nu/marginalia/util/TransformListTest.java +++ b/code/features/query-parser/src/test/java/nu/marginalia/transform_list/TransformListTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.util; +package nu.marginalia.transform_list; import org.junit.jupiter.api.Test; diff --git a/code/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsRandom.java b/code/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsRandom.java index 2f0b4cc0..2dd503b4 100644 --- a/code/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsRandom.java +++ b/code/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsRandom.java @@ -5,7 +5,7 @@ import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.browse.model.BrowseResult; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.dbcommon.EdgeDomainBlacklist; +import nu.marginalia.model.dbcommon.DomainBlacklist; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -23,7 +23,7 @@ public class DbBrowseDomainsRandom { this.dataSource = dataSource; } - public List getRandomDomains(int count, EdgeDomainBlacklist blacklist, int set) { + public List getRandomDomains(int count, DomainBlacklist blacklist, int set) { final String q = """ SELECT DOMAIN_ID, DOMAIN_NAME diff --git a/code/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java 
b/code/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java index 6f3d9bd8..cdeac7fd 100644 --- a/code/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java +++ b/code/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java @@ -5,7 +5,7 @@ import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.browse.model.BrowseResult; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.dbcommon.EdgeDomainBlacklist; +import nu.marginalia.model.dbcommon.DomainBlacklist; import nu.marginalia.model.id.EdgeId; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,7 +24,7 @@ public class DbBrowseDomainsSimilarCosine { this.dataSource = dataSource; } - public List getDomainNeighborsAdjacentCosine(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) { + public List getDomainNeighborsAdjacentCosine(EdgeId domainId, DomainBlacklist blacklist, int count) { List domains = new ArrayList<>(count); String q = """ diff --git a/code/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java b/code/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java index 01f43060..0ab6ade6 100644 --- a/code/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java +++ b/code/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java @@ -6,7 +6,7 @@ import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.browse.model.BrowseResult; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.dbcommon.EdgeDomainBlacklist; +import nu.marginalia.model.dbcommon.DomainBlacklist; import nu.marginalia.model.id.EdgeId; import nu.marginalia.model.id.EdgeIdCollection; import org.slf4j.Logger; @@ -26,7 +26,7 @@ public class 
DbBrowseDomainsSimilarOldAlgo { this.dataSource = dataSource; } - public List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) { + public List getDomainNeighborsAdjacent(EdgeId domainId, DomainBlacklist blacklist, int count) { final Set domains = new HashSet<>(count*3); final String q = """ @@ -131,7 +131,7 @@ public class DbBrowseDomainsSimilarOldAlgo { return new ArrayList<>(domains); } - public List getRandomDomains(int count, EdgeDomainBlacklist blacklist, int set) { + public List getRandomDomains(int count, DomainBlacklist blacklist, int set) { final String q = """ SELECT DOMAIN_ID, DOMAIN_NAME diff --git a/code/features/readme.md b/code/features/readme.md index 4e5bbb22..ddd34723 100644 --- a/code/features/readme.md +++ b/code/features/readme.md @@ -7,5 +7,8 @@ search engine code. * [domain-ranking](domain-ranking/) contains ranking algorithms. * [query-parser](query-parser/) contains code for parsing the user-facing query grammar. +* [adblock](adblock/) +* [pubdate](pubdate/) + * [screenshots](screenshots/) and [random-websites](random-websites/) contains SQL queries random exploration mode. 
\ No newline at end of file diff --git a/code/features/topic-detection/build.gradle b/code/features/topic-detection/build.gradle new file mode 100644 index 00000000..035e9974 --- /dev/null +++ b/code/features/topic-detection/build.gradle @@ -0,0 +1,43 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id "de.undercouch.download" version "5.1.0" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':code:common:config') + implementation project(':code:libraries:language-processing') + implementation project(':third-party') + + implementation libs.lombok + annotationProcessor libs.lombok + + implementation libs.bundles.slf4j + implementation libs.guice + implementation libs.notnull + implementation libs.jsoup + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/code/features/topic-detection/readme.md b/code/features/topic-detection/readme.md new file mode 100644 index 00000000..db9a0000 --- /dev/null +++ b/code/features/topic-detection/readme.md @@ -0,0 +1,4 @@ +# Topic Detection + +This is an experiment in using hand-crafted naive Bayesian filters to detect the topic of a website. +Notably, it detects recipes very well. 
\ No newline at end of file diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/RecipeDetector.java b/code/features/topic-detection/src/main/java/nu/marginalia/topic/RecipeDetector.java similarity index 99% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/RecipeDetector.java rename to code/features/topic-detection/src/main/java/nu/marginalia/topic/RecipeDetector.java index 29dea927..2c44d568 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/RecipeDetector.java +++ b/code/features/topic-detection/src/main/java/nu/marginalia/topic/RecipeDetector.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.processor.logic.topic; +package nu.marginalia.topic; import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/TextileCraftDetector.java b/code/features/topic-detection/src/main/java/nu/marginalia/topic/TextileCraftDetector.java similarity index 99% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/TextileCraftDetector.java rename to code/features/topic-detection/src/main/java/nu/marginalia/topic/TextileCraftDetector.java index 771d1491..64ccaf2e 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/TextileCraftDetector.java +++ b/code/features/topic-detection/src/main/java/nu/marginalia/topic/TextileCraftDetector.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.processor.logic.topic; +package nu.marginalia.topic; import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/WoodworkingDetector.java 
b/code/features/topic-detection/src/main/java/nu/marginalia/topic/WoodworkingDetector.java similarity index 99% rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/WoodworkingDetector.java rename to code/features/topic-detection/src/main/java/nu/marginalia/topic/WoodworkingDetector.java index fd9be203..32e362d2 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/WoodworkingDetector.java +++ b/code/features/topic-detection/src/main/java/nu/marginalia/topic/WoodworkingDetector.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.processor.logic.topic; +package nu.marginalia.topic; import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; diff --git a/code/index/index-forward/build.gradle b/code/index/index-forward/build.gradle index 9dff1fe4..8a789e59 100644 --- a/code/index/index-forward/build.gradle +++ b/code/index/index-forward/build.gradle @@ -13,7 +13,6 @@ java { dependencies { implementation project(':code:libraries:array') implementation project(':code:libraries:btree') - implementation project(':code:libraries:misc') implementation project(':code:features:domain-ranking') implementation project(':code:index:index-query') implementation project(':code:index:index-journal') diff --git a/code/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPriorityParameters.java b/code/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPriorityParameters.java index 1e096b0b..dba81461 100644 --- a/code/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPriorityParameters.java +++ b/code/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPriorityParameters.java @@ -1,18 +1,18 @@ package nu.marginalia.index.reverse; import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.model.crawl.EdgePageWordFlags; +import 
nu.marginalia.model.idx.WordFlags; public class ReverseIndexPriorityParameters { private static final long highPriorityFlags = - EdgePageWordFlags.Title.asBit() - | EdgePageWordFlags.Subjects.asBit() - | EdgePageWordFlags.TfIdfHigh.asBit() - | EdgePageWordFlags.NamesWords.asBit() - | EdgePageWordFlags.UrlDomain.asBit() - | EdgePageWordFlags.UrlPath.asBit() - | EdgePageWordFlags.Site.asBit() - | EdgePageWordFlags.SiteAdjacent.asBit(); + WordFlags.Title.asBit() + | WordFlags.Subjects.asBit() + | WordFlags.TfIdfHigh.asBit() + | WordFlags.NamesWords.asBit() + | WordFlags.UrlDomain.asBit() + | WordFlags.UrlPath.asBit() + | WordFlags.Site.asBit() + | WordFlags.SiteAdjacent.asBit(); public static boolean filterPriorityRecord(IndexJournalEntryData.Record record) { long meta = record.metadata(); diff --git a/code/index/lexicon/build.gradle b/code/index/lexicon/build.gradle index 7b0e8dd7..18da060e 100644 --- a/code/index/lexicon/build.gradle +++ b/code/index/lexicon/build.gradle @@ -13,8 +13,8 @@ java { } dependencies { - implementation project(':code:libraries:misc') + implementation project(':code:libraries:next-prime') implementation libs.lombok annotationProcessor libs.lombok implementation libs.bundles.slf4j diff --git a/code/index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java b/code/index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java index 781c3b5c..e17c9c19 100644 --- a/code/index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java +++ b/code/index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java @@ -1,6 +1,6 @@ package nu.marginalia.dict; -import nu.marginalia.util.PrimeUtil; +import nu.marginalia.util.NextPrimeUtil; import java.nio.ByteBuffer; import java.nio.IntBuffer; @@ -33,7 +33,7 @@ public class OffHeapDictionaryHashMap implements DictionaryMap { buffers = new IntBuffer[bufferCount]; // Actually use a prime size for Donald Knuth reasons - hashTableSize = 
PrimeUtil.nextPrime(sizeMemory, -1); + hashTableSize = NextPrimeUtil.nextPrime(sizeMemory, -1); intsPerBuffer = 1 + (int)(sizeMemory/ bufferCount); bufferSizeBytes = intSize*intsPerBuffer; diff --git a/code/libraries/misc/build.gradle b/code/libraries/braille-block-punch-cards/build.gradle similarity index 91% rename from code/libraries/misc/build.gradle rename to code/libraries/braille-block-punch-cards/build.gradle index 26013e99..9a8058b9 100644 --- a/code/libraries/misc/build.gradle +++ b/code/libraries/braille-block-punch-cards/build.gradle @@ -9,9 +9,6 @@ java { } dependencies { - implementation project(':third-party') - - implementation libs.lombok annotationProcessor libs.lombok implementation libs.bundles.slf4j diff --git a/code/libraries/braille-block-punch-cards/readme.md b/code/libraries/braille-block-punch-cards/readme.md new file mode 100644 index 00000000..1785a2fc --- /dev/null +++ b/code/libraries/braille-block-punch-cards/readme.md @@ -0,0 +1,9 @@ +# Braille Block Punch Cards + +Used to render 2x8 bit matrices using the Braille block from integers. + +This is The Way when it comes to representing bit masks to humans. 
+ +## Central Classes + +* [BrailleBlockPunchCards](src/main/java/nu/marginalia/bbpc/BrailleBlockPunchCards.java) \ No newline at end of file diff --git a/code/common/model/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java b/code/libraries/braille-block-punch-cards/src/main/java/nu/marginalia/bbpc/BrailleBlockPunchCards.java similarity index 98% rename from code/common/model/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java rename to code/libraries/braille-block-punch-cards/src/main/java/nu/marginalia/bbpc/BrailleBlockPunchCards.java index 5877ae03..2105e93a 100644 --- a/code/common/model/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java +++ b/code/libraries/braille-block-punch-cards/src/main/java/nu/marginalia/bbpc/BrailleBlockPunchCards.java @@ -1,4 +1,4 @@ -package nu.marginalia.util; +package nu.marginalia.bbpc; public class BrailleBlockPunchCards { diff --git a/code/common/model/src/test/java/nu/marginalia/util/BrailleBlockPunchCardsTest.java b/code/libraries/braille-block-punch-cards/src/test/java/nu/marginalia/bbpc/BrailleBlockPunchCardsTest.java similarity index 91% rename from code/common/model/src/test/java/nu/marginalia/util/BrailleBlockPunchCardsTest.java rename to code/libraries/braille-block-punch-cards/src/test/java/nu/marginalia/bbpc/BrailleBlockPunchCardsTest.java index 0efe59d8..05d7b84c 100644 --- a/code/common/model/src/test/java/nu/marginalia/util/BrailleBlockPunchCardsTest.java +++ b/code/libraries/braille-block-punch-cards/src/test/java/nu/marginalia/bbpc/BrailleBlockPunchCardsTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.util; +package nu.marginalia.bbpc; import org.junit.jupiter.api.Test; diff --git a/code/libraries/btree/build.gradle b/code/libraries/btree/build.gradle index 7c6854a9..c5a9950e 100644 --- a/code/libraries/btree/build.gradle +++ b/code/libraries/btree/build.gradle @@ -11,12 +11,12 @@ java { dependencies { implementation project(':third-party') implementation 
project(':code:libraries:array') + implementation project(':code:libraries:next-prime') implementation libs.lombok annotationProcessor libs.lombok implementation libs.bundles.slf4j - testImplementation project(':code:libraries:misc') testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito diff --git a/code/libraries/btree/src/test/java/nu/marginalia/btree/BTreeReaderQueryDataWithIndexTest.java b/code/libraries/btree/src/test/java/nu/marginalia/btree/BTreeReaderQueryDataWithIndexTest.java index ed1d2b28..fa00d98c 100644 --- a/code/libraries/btree/src/test/java/nu/marginalia/btree/BTreeReaderQueryDataWithIndexTest.java +++ b/code/libraries/btree/src/test/java/nu/marginalia/btree/BTreeReaderQueryDataWithIndexTest.java @@ -1,18 +1,14 @@ package nu.marginalia.btree; import nu.marginalia.array.LongArray; -import nu.marginalia.array.buffer.LongQueryBuffer; import nu.marginalia.btree.model.BTreeBlockSize; import nu.marginalia.btree.model.BTreeContext; -import nu.marginalia.util.PrimeUtil; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import java.io.IOException; -import java.util.Arrays; import static org.junit.jupiter.api.Assertions.assertArrayEquals; -import static org.junit.jupiter.api.Assertions.assertEquals; public class BTreeReaderQueryDataWithIndexTest { BTreeContext ctx = new BTreeContext(5, 2, BTreeBlockSize.BS_32); diff --git a/code/libraries/btree/src/test/java/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java b/code/libraries/btree/src/test/java/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java index 7e77fb9a..8b65753d 100644 --- a/code/libraries/btree/src/test/java/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java +++ b/code/libraries/btree/src/test/java/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java @@ -4,7 +4,7 @@ import nu.marginalia.array.LongArray; import nu.marginalia.array.buffer.LongQueryBuffer; import 
nu.marginalia.btree.model.BTreeBlockSize; import nu.marginalia.btree.model.BTreeContext; -import nu.marginalia.util.PrimeUtil; +import nu.marginalia.util.NextPrimeUtil; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -24,7 +24,7 @@ public class BTreeReaderRejectRetainWithIndexTest { int p = 2; for (int idx = 0; idx < 1000; idx++) { slice.set(idx, p); - p = (int) PrimeUtil.nextPrime(p + 1, 1); + p = (int) NextPrimeUtil.nextPrime(p + 1, 1); } }); } diff --git a/code/libraries/btree/src/test/java/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java b/code/libraries/btree/src/test/java/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java index 25dba326..e5d4dc79 100644 --- a/code/libraries/btree/src/test/java/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java +++ b/code/libraries/btree/src/test/java/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java @@ -4,7 +4,7 @@ import nu.marginalia.array.LongArray; import nu.marginalia.array.buffer.LongQueryBuffer; import nu.marginalia.btree.model.BTreeBlockSize; import nu.marginalia.btree.model.BTreeContext; -import nu.marginalia.util.PrimeUtil; +import nu.marginalia.util.NextPrimeUtil; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -24,7 +24,7 @@ public class BTreeReaderRejectRetainWithoutIndexTest { int p = 2; for (int idx = 0; idx < 1000; idx++) { slice.set(idx, p); - p = (int) PrimeUtil.nextPrime(p + 1, 1); + p = (int) NextPrimeUtil.nextPrime(p + 1, 1); } }); } diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java index c4a9068a..0cdc7bed 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java @@ 
-2,7 +2,7 @@ package nu.marginalia.language.model; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.WordFlags; import java.util.EnumSet; import java.util.HashSet; @@ -21,47 +21,47 @@ public final class KeywordMetadata { public final Object2IntOpenHashMap wordsTfIdf; public final Object2IntOpenHashMap positionMask; - private final EnumSet wordFlagsTemplate; + private final EnumSet wordFlagsTemplate; - public KeywordMetadata(EnumSet flags) { + public KeywordMetadata(EnumSet flags) { this.positionMask = new Object2IntOpenHashMap<>(10_000, 0.7f); this.wordsTfIdf = new Object2IntOpenHashMap<>(10_000, 0.7f); this.wordFlagsTemplate = flags; } public KeywordMetadata() { - this(EnumSet.noneOf(EdgePageWordFlags.class)); + this(EnumSet.noneOf(WordFlags.class)); } - public long getMetadataForWord(EnumSet flagsTemplate, String stemmed) { + public long getMetadataForWord(EnumSet flagsTemplate, String stemmed) { int tfidf = wordsTfIdf.getOrDefault(stemmed, 0); - EnumSet flags = flagsTemplate.clone(); + EnumSet flags = flagsTemplate.clone(); if (tfidf > 100) - flags.add(EdgePageWordFlags.TfIdfHigh); + flags.add(WordFlags.TfIdfHigh); if (subjectKeywords.contains(stemmed)) - flags.add(EdgePageWordFlags.Subjects); + flags.add(WordFlags.Subjects); if (namesKeywords.contains(stemmed)) - flags.add(EdgePageWordFlags.NamesWords); + flags.add(WordFlags.NamesWords); if (titleKeywords.contains(stemmed)) - flags.add(EdgePageWordFlags.Title); + flags.add(WordFlags.Title); if (urlKeywords.contains(stemmed)) - flags.add(EdgePageWordFlags.UrlPath); + flags.add(WordFlags.UrlPath); if (domainKeywords.contains(stemmed)) - flags.add(EdgePageWordFlags.UrlDomain); + flags.add(WordFlags.UrlDomain); int positions = positionMask.getOrDefault(stemmed, 0); return new WordMetadata(tfidf, positions, flags).encode(); } - public EnumSet wordFlagsTemplate() { + public 
EnumSet wordFlagsTemplate() { return wordFlagsTemplate; } diff --git a/code/common/model/src/main/java/nu/marginalia/util/DenseBitMap.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/DenseBitMap.java similarity index 97% rename from code/common/model/src/main/java/nu/marginalia/util/DenseBitMap.java rename to code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/DenseBitMap.java index 88d2fa18..c31a386c 100644 --- a/code/common/model/src/main/java/nu/marginalia/util/DenseBitMap.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/DenseBitMap.java @@ -1,4 +1,4 @@ -package nu.marginalia.util; +package nu.marginalia.language.statistics; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java index c842ee5b..367ccf37 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java @@ -5,7 +5,6 @@ import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import com.google.inject.Inject; import nu.marginalia.LanguageModels; -import nu.marginalia.util.DenseBitMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/code/common/model/src/test/java/nu/marginalia/util/DenseBitMapTest.java b/code/libraries/language-processing/src/test/java/nu/marginalia/language/statistics/DenseBitMapTest.java similarity index 88% rename from code/common/model/src/test/java/nu/marginalia/util/DenseBitMapTest.java rename to code/libraries/language-processing/src/test/java/nu/marginalia/language/statistics/DenseBitMapTest.java index 5f6d6aec..1a506c4c 100644 --- 
a/code/common/model/src/test/java/nu/marginalia/util/DenseBitMapTest.java +++ b/code/libraries/language-processing/src/test/java/nu/marginalia/language/statistics/DenseBitMapTest.java @@ -1,9 +1,9 @@ -package nu.marginalia.util; +package nu.marginalia.language.statistics; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; class DenseBitMapTest { diff --git a/code/libraries/misc/src/test/java/nu/marginalia/test/TestUtil.java b/code/libraries/misc/src/test/java/nu/marginalia/test/TestUtil.java deleted file mode 100644 index 44a489bb..00000000 --- a/code/libraries/misc/src/test/java/nu/marginalia/test/TestUtil.java +++ /dev/null @@ -1,50 +0,0 @@ -package nu.marginalia.test; - - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; - -public class TestUtil { - private static boolean isTempDir(Path dir) { - return dir.startsWith("/tmp") || dir.toString().contains("tmp"); - } - - public static void clearTempDir(Path dir) { - if (!isTempDir(dir)) { - throw new IllegalArgumentException("Refusing to recursively delete directory with that name"); - } - if (Files.isDirectory(dir)) { - for (File f : dir.toFile().listFiles()) { - File[] files = f.listFiles(); - if (files != null) { - Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); - } - System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")"); - f.delete(); - } - } - System.out.println("Deleting " + dir); - dir.toFile().delete(); - } - - private static String fileSize(Path path) { - try { - long sizeBytes = Files.size(path); - - if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb"; - if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) 
+ "Mb"; - if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb"; - return sizeBytes + "b"; - } - catch (IOException ex) { - throw new RuntimeException(ex); - } - } - - private static String round(double d) { - return String.format("%.2f", d); - } -} diff --git a/code/libraries/next-prime/build.gradle b/code/libraries/next-prime/build.gradle new file mode 100644 index 00000000..9a8058b9 --- /dev/null +++ b/code/libraries/next-prime/build.gradle @@ -0,0 +1,27 @@ +plugins { + id 'java' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.notnull + + implementation libs.fastutil + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} diff --git a/code/libraries/next-prime/readme.md b/code/libraries/next-prime/readme.md new file mode 100644 index 00000000..a6b2a134 --- /dev/null +++ b/code/libraries/next-prime/readme.md @@ -0,0 +1,4 @@ +# Next Prime Util + +This is a brute force prime sieve. If finding many (or large) primes quickly +is important to you, don't use code like this. 
\ No newline at end of file diff --git a/code/libraries/misc/src/main/java/nu/marginalia/util/PrimeUtil.java b/code/libraries/next-prime/src/main/java/nu/marginalia/util/NextPrimeUtil.java similarity index 93% rename from code/libraries/misc/src/main/java/nu/marginalia/util/PrimeUtil.java rename to code/libraries/next-prime/src/main/java/nu/marginalia/util/NextPrimeUtil.java index 3f9be79d..183344b7 100644 --- a/code/libraries/misc/src/main/java/nu/marginalia/util/PrimeUtil.java +++ b/code/libraries/next-prime/src/main/java/nu/marginalia/util/NextPrimeUtil.java @@ -1,7 +1,6 @@ package nu.marginalia.util; -// This is not a fast way of finding primes -public class PrimeUtil { +public class NextPrimeUtil { /** Returns the next prime value starting at start. If start is prime, return start. */ diff --git a/code/libraries/next-prime/src/test/java/nu/marginalia/util/NextPrimeUtilTest.java b/code/libraries/next-prime/src/test/java/nu/marginalia/util/NextPrimeUtilTest.java new file mode 100644 index 00000000..381490cf --- /dev/null +++ b/code/libraries/next-prime/src/test/java/nu/marginalia/util/NextPrimeUtilTest.java @@ -0,0 +1,29 @@ +package nu.marginalia.util; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +class NextPrimeUtilTest { + + @Test + void isPrime() { + Assertions.assertTrue(NextPrimeUtil.isPrime(1)); + Assertions.assertTrue(NextPrimeUtil.isPrime(2)); + Assertions.assertTrue(NextPrimeUtil.isPrime(3)); + Assertions.assertFalse(NextPrimeUtil.isPrime(4)); + Assertions.assertTrue(NextPrimeUtil.isPrime(5)); + Assertions.assertFalse(NextPrimeUtil.isPrime(6)); + Assertions.assertTrue(NextPrimeUtil.isPrime(7)); + Assertions.assertFalse(NextPrimeUtil.isPrime(8)); + Assertions.assertFalse(NextPrimeUtil.isPrime(9)); + Assertions.assertFalse(NextPrimeUtil.isPrime(10)); + Assertions.assertTrue(NextPrimeUtil.isPrime(11)); + } + + @Test + void nextPrime() { + System.out.println(NextPrimeUtil.nextPrime(1L<<31, -1)); + 
System.out.println(NextPrimeUtil.nextPrime(1L<<31, 1)); + + } +} \ No newline at end of file diff --git a/code/libraries/readme.md b/code/libraries/readme.md index 8c914b8d..af41df4a 100644 --- a/code/libraries/readme.md +++ b/code/libraries/readme.md @@ -14,7 +14,5 @@ bad support for. It's designed to be able to easily replaced when *Java's Foreig * [big-string](big-string/) offers seamless string compression * [random-write-funnel](random-write-funnel/) is a tool for reducing write amplification when constructing large files out of order. - -## The rest - -* [misc](misc/) is just random bits and bobs that didn't fit anywhere. \ No newline at end of file +* [next-prime](next-prime/) naive brute force prime sieve. +* [braille-block-punch-cards](braille-block-punch-cards/) renders bit masks into human-readable dot matrices using the braille block. \ No newline at end of file diff --git a/code/services-core/index-service/build.gradle b/code/services-core/index-service/build.gradle index 4218a215..42b6d8c7 100644 --- a/code/services-core/index-service/build.gradle +++ b/code/services-core/index-service/build.gradle @@ -30,7 +30,6 @@ dependencies { implementation project(':code:libraries:array') implementation project(':code:libraries:btree') - implementation project(':code:libraries:misc') implementation project(':code:index:index-journal') implementation project(':code:index:index-query') diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java index 9ea28aec..63abbcf7 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java @@ -6,7 +6,7 @@ import gnu.trove.set.hash.TLongHashSet; import it.unimi.dsi.fastutil.ints.IntArrayList; import 
it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; import nu.marginalia.index.svc.SearchTermsService; -import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.client.model.results.SearchResultItem; @@ -144,19 +144,19 @@ public class IndexResultValuator { private boolean filterRequired(long metadata, QueryStrategy queryStrategy) { if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) { - return EdgePageWordFlags.Site.isPresent(metadata); + return WordFlags.Site.isPresent(metadata); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) { - return EdgePageWordFlags.Subjects.isPresent(metadata); + return WordFlags.Subjects.isPresent(metadata); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) { - return EdgePageWordFlags.Title.isPresent(metadata); + return WordFlags.Title.isPresent(metadata); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) { - return EdgePageWordFlags.UrlPath.isPresent(metadata); + return WordFlags.UrlPath.isPresent(metadata); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) { - return EdgePageWordFlags.UrlDomain.isPresent(metadata); + return WordFlags.UrlDomain.isPresent(metadata); } return true; } @@ -166,9 +166,9 @@ public class IndexResultValuator { long maskDirectRaw = ~0; long maskAdjacent = ~0; - final int flagBitMask = EdgePageWordFlags.Title.asBit() - | EdgePageWordFlags.Subjects.asBit() - | EdgePageWordFlags.Synthetic.asBit(); + final int flagBitMask = WordFlags.Title.asBit() + | WordFlags.Subjects.asBit() + | WordFlags.Synthetic.asBit(); int termCount = 0; double tfIdfSum = 1.; diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/service/util/PrimeUtilTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/service/util/PrimeUtilTest.java deleted file mode 100644 index 
c11d9719..00000000 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/service/util/PrimeUtilTest.java +++ /dev/null @@ -1,30 +0,0 @@ -package nu.marginalia.index.service.util; - -import nu.marginalia.util.PrimeUtil; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -class PrimeUtilTest { - - @Test - void isPrime() { - Assertions.assertTrue(PrimeUtil.isPrime(1)); - Assertions.assertTrue(PrimeUtil.isPrime(2)); - Assertions.assertTrue(PrimeUtil.isPrime(3)); - Assertions.assertFalse(PrimeUtil.isPrime(4)); - Assertions.assertTrue(PrimeUtil.isPrime(5)); - Assertions.assertFalse(PrimeUtil.isPrime(6)); - Assertions.assertTrue(PrimeUtil.isPrime(7)); - Assertions.assertFalse(PrimeUtil.isPrime(8)); - Assertions.assertFalse(PrimeUtil.isPrime(9)); - Assertions.assertFalse(PrimeUtil.isPrime(10)); - Assertions.assertTrue(PrimeUtil.isPrime(11)); - } - - @Test - void nextPrime() { - System.out.println(PrimeUtil.nextPrime(1L<<31, -1)); - System.out.println(PrimeUtil.nextPrime(1L<<31, 1)); - - } -} \ No newline at end of file diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java index 8c75c4e0..88ed3a4b 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java @@ -14,7 +14,7 @@ import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.lexicon.KeywordLexicon; -import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordMetadata; import 
nu.marginalia.service.server.Initialization; @@ -169,7 +169,7 @@ public class IndexQueryServiceIntegrationTest { long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) { data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i])); - data[2*i + 1] = new WordMetadata(i, i, EnumSet.of(EdgePageWordFlags.Title)).encode(); + data[2*i + 1] = new WordMetadata(i, i, EnumSet.of(WordFlags.Title)).encode(); } indexJournalWriter.put(header, new IndexJournalEntryData(data)); @@ -182,7 +182,7 @@ public class IndexQueryServiceIntegrationTest { long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) { data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i])); - data[2*i + 1] = new WordMetadata(i % 20, i, EnumSet.of(EdgePageWordFlags.Title)).encode(); + data[2*i + 1] = new WordMetadata(i % 20, i, EnumSet.of(WordFlags.Title)).encode(); } indexJournalWriter.put(header, new IndexJournalEntryData(data)); diff --git a/code/services-core/search-service/build.gradle b/code/services-core/search-service/build.gradle index ef641f92..58a205e3 100644 --- a/code/services-core/search-service/build.gradle +++ b/code/services-core/search-service/build.gradle @@ -27,9 +27,9 @@ dependencies { implementation project(':code:common:config') implementation project(':code:index:index-query') - implementation project(':code:libraries:misc') implementation project(':code:libraries:easy-lsh') implementation project(':code:libraries:language-processing') + implementation project(':code:libraries:braille-block-punch-cards') implementation project(':code:api:assistant-api') implementation project(':code:api:index-api') diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java index b6f009da..aae4cd99 100644 --- 
a/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java @@ -8,7 +8,7 @@ import nu.marginalia.browse.model.BrowseResult; import nu.marginalia.browse.model.BrowseResultSet; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.dbcommon.DbDomainQueries; -import nu.marginalia.model.dbcommon.EdgeDomainBlacklist; +import nu.marginalia.model.dbcommon.DomainBlacklist; import nu.marginalia.search.command.SearchCommandInterface; import nu.marginalia.search.command.SearchParameters; import nu.marginalia.search.results.BrowseResultCleaner; @@ -28,7 +28,7 @@ public class BrowseCommand implements SearchCommandInterface { private final DbBrowseDomainsSimilarCosine similarDomains; private final DbBrowseDomainsSimilarOldAlgo similarDomainsOld; private final DbDomainQueries domainQueries; - private final EdgeDomainBlacklist blacklist; + private final DomainBlacklist blacklist; private final MustacheRenderer browseResultsRenderer; private final BrowseResultCleaner browseResultCleaner; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -38,7 +38,7 @@ public class BrowseCommand implements SearchCommandInterface { public BrowseCommand(DbBrowseDomainsRandom randomDomains, DbBrowseDomainsSimilarCosine similarDomains, DbBrowseDomainsSimilarOldAlgo similarDomainsOld, DbDomainQueries domainQueries, - EdgeDomainBlacklist blacklist, + DomainBlacklist blacklist, RendererFactory rendererFactory, BrowseResultCleaner browseResultCleaner) throws IOException diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SearchCommand.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SearchCommand.java index 63f12470..f7255e8a 100644 --- 
a/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SearchCommand.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SearchCommand.java @@ -2,15 +2,13 @@ package nu.marginalia.search.command.commands; import com.google.inject.Inject; import nu.marginalia.client.Context; -import nu.marginalia.model.dbcommon.DbDomainQueries; -import nu.marginalia.model.dbcommon.EdgeDomainBlacklist; +import nu.marginalia.model.dbcommon.DomainBlacklist; import nu.marginalia.search.SearchOperator; import nu.marginalia.search.command.SearchCommandInterface; import nu.marginalia.search.command.SearchParameters; import nu.marginalia.search.model.DecoratedSearchResults; import nu.marginalia.search.model.UrlDetails; import nu.marginalia.search.query.model.UserSearchParameters; -import nu.marginalia.search.results.BrowseResultCleaner; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; @@ -18,13 +16,13 @@ import java.io.IOException; import java.util.Optional; public class SearchCommand implements SearchCommandInterface { - private final EdgeDomainBlacklist blacklist; + private final DomainBlacklist blacklist; private final SearchOperator searchOperator; private final MustacheRenderer searchResultsRenderer; @Inject - public SearchCommand(EdgeDomainBlacklist blacklist, + public SearchCommand(DomainBlacklist blacklist, SearchOperator searchOperator, RendererFactory rendererFactory ) throws IOException { diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/db/DbUrlDetailsQuery.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/db/DbUrlDetailsQuery.java index 25bec7d4..1c760c54 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/db/DbUrlDetailsQuery.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/db/DbUrlDetailsQuery.java @@ -7,7 +7,7 @@ import 
com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.id.EdgeId; import nu.marginalia.model.id.EdgeIdCollection; import nu.marginalia.search.model.PageScoreAdjustment; @@ -82,7 +82,7 @@ public class DbUrlDetailsQuery { rsp.getString(8), // format rsp.getInt(9), // features rsp.getString(10), // ip - EdgeDomainIndexingState.valueOf(rsp.getString(11)), // domainState + DomainIndexingState.valueOf(rsp.getString(11)), // domainState rsp.getLong(12), // dataHash PageScoreAdjustment.zero(), // urlQualityAdjustment Integer.MAX_VALUE, // rankingId diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java index 5b9b85f0..2f97e49f 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java @@ -3,7 +3,7 @@ package nu.marginalia.search.model; import lombok.*; import nu.marginalia.index.client.model.results.SearchResultItem; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.HtmlFeature; import java.util.EnumSet; @@ -25,7 +25,7 @@ public class UrlDetails { public int features; public String ip; - public EdgeDomainIndexingState domainState; + public DomainIndexingState domainState; public long dataHash; @@ -182,7 +182,7 @@ public class UrlDetails { public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.ADVERTISEMENT); } public boolean isSpecialDomain() { - return domainState == EdgeDomainIndexingState.SPECIAL; + return domainState == 
DomainIndexingState.SPECIAL; } public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java index b08054e5..7b1907b1 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java @@ -4,14 +4,14 @@ import com.google.inject.Inject; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntObjectHashMap; import it.unimi.dsi.fastutil.ints.Int2IntArrayMap; +import nu.marginalia.bbpc.BrailleBlockPunchCards; import nu.marginalia.search.db.DbUrlDetailsQuery; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.id.EdgeIdList; import nu.marginalia.index.client.model.results.SearchResultItem; import nu.marginalia.search.model.UrlDetails; import nu.marginalia.search.valuation.SearchResultValuator; -import nu.marginalia.util.BrailleBlockPunchCards; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -97,7 +97,7 @@ public class SearchResultDecorator { private double calculateTermScore(SearchResultItem resultItem, UrlDetails details) { - final double statePenalty = (details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0; + final double statePenalty = (details.domainState == DomainIndexingState.SPECIAL) ? 
1.25 : 0; final double value = valuator.evaluateTerms(resultItem.scores, details.words, details.title.length()); return value + statePenalty; diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java index 092f2aff..e52f2a48 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java @@ -3,7 +3,7 @@ package nu.marginalia.search.siteinfo; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.dbcommon.DbDomainQueries; import nu.marginalia.model.id.EdgeId; import nu.marginalia.search.model.DomainInformation; @@ -63,7 +63,7 @@ public class DomainInformationService { double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100; - EdgeDomainIndexingState state = getDomainState(domainId); + DomainIndexingState state = getDomainState(domainId); List linkingDomains = getLinkingDomains(domainId); var di = DomainInformation.builder() @@ -229,14 +229,14 @@ public class DomainInformationService { } } - public EdgeDomainIndexingState getDomainState(EdgeId domainId) { + public DomainIndexingState getDomainState(EdgeId domainId) { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) { stmt.setInt(1, domainId.id()); var rsp = stmt.executeQuery(); if (rsp.next()) { - return EdgeDomainIndexingState.valueOf(rsp.getString(1)); + return DomainIndexingState.valueOf(rsp.getString(1)); } } catch (Exception ex) { logger.error("DB error", ex); @@ -244,7 +244,7 @@ public 
class DomainInformationService { } catch (SQLException throwables) { throwables.printStackTrace(); } - return EdgeDomainIndexingState.ERROR; + return DomainIndexingState.ERROR; } public List getLinkingDomains(EdgeId domainId) { diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java index 8bdd791c..46fb0cb5 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java @@ -3,7 +3,7 @@ package nu.marginalia.search.valuation; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.language.statistics.TermFrequencyDict; -import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.index.client.model.results.SearchResultKeywordScore; import nu.marginalia.index.client.model.query.SearchSubquery; @@ -140,23 +140,23 @@ public class SearchResultValuator { private double calculateSingleTermBonus(SearchResultsKeywordSet set, double totalFactor) { var theKeyword = set.iterator().next(); - if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.Title)) { + if (theKeyword.wordMetadata.hasFlag(WordFlags.Title)) { return totalFactor * 0.5; } - else if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.Subjects)) { + else if (theKeyword.wordMetadata.hasFlag(WordFlags.Subjects)) { return totalFactor * 0.6; } - else if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.SiteAdjacent)) { + else if (theKeyword.wordMetadata.hasFlag(WordFlags.SiteAdjacent)) { return totalFactor * 0.65; } - else if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.Site)) { + else if (theKeyword.wordMetadata.hasFlag(WordFlags.Site)) { return 
totalFactor * 0.7; } - if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.UrlDomain)) { + if (theKeyword.wordMetadata.hasFlag(WordFlags.UrlDomain)) { return totalFactor * 0.8; } - else if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.UrlPath)) { + else if (theKeyword.wordMetadata.hasFlag(WordFlags.UrlPath)) { return totalFactor * 0.9; } @@ -167,7 +167,7 @@ public class SearchResultValuator { long maskDirect = ~0; long maskAdjacent = ~0; - byte excludeMask = (byte) (EdgePageWordFlags.Title.asBit() | EdgePageWordFlags.Subjects.asBit() | EdgePageWordFlags.Synthetic.asBit()); + byte excludeMask = (byte) (WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.Synthetic.asBit()); for (var keyword : keywordSet) { var meta = keyword.wordMetadata; @@ -213,15 +213,15 @@ public class SearchResultValuator { final double k = keyword.weight() / totalWeight; - EnumSet flags = keyword.flags(); + EnumSet flags = keyword.flags(); - final boolean title = flags.contains(EdgePageWordFlags.Title); - final boolean site = flags.contains(EdgePageWordFlags.Site); - final boolean siteAdjacent = flags.contains(EdgePageWordFlags.SiteAdjacent); - final boolean subject = flags.contains(EdgePageWordFlags.Subjects); - final boolean names = flags.contains(EdgePageWordFlags.NamesWords); - final boolean urlDomain = flags.contains(EdgePageWordFlags.UrlDomain); - final boolean urlPath = flags.contains(EdgePageWordFlags.UrlPath); + final boolean title = flags.contains(WordFlags.Title); + final boolean site = flags.contains(WordFlags.Site); + final boolean siteAdjacent = flags.contains(WordFlags.SiteAdjacent); + final boolean subject = flags.contains(WordFlags.Subjects); + final boolean names = flags.contains(WordFlags.NamesWords); + final boolean urlDomain = flags.contains(WordFlags.UrlDomain); + final boolean urlPath = flags.contains(WordFlags.UrlPath); if (title) { if (titleLength <= 64) { @@ -331,7 +331,7 @@ public class SearchResultValuator { return wordMetadata.tfIdf(); } - public 
EnumSet flags() { + public EnumSet flags() { return wordMetadata.flagSet(); } } diff --git a/code/services-core/search-service/src/main/resources/templates/search/search-result-metadata.hdb b/code/services-core/search-service/src/main/resources/templates/search/search-result-metadata.hdb index 9ccaa23e..a17d0864 100644 --- a/code/services-core/search-service/src/main/resources/templates/search/search-result-metadata.hdb +++ b/code/services-core/search-service/src/main/resources/templates/search/search-result-metadata.hdb @@ -1,3 +1,8 @@ {{#if problems}} ⚠ {{problemCount}} {{/if}} -{{positions}} + diff --git a/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java b/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java index 333eca48..aaa4a56a 100644 --- a/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java +++ b/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java @@ -2,8 +2,8 @@ package nu.marginalia.search.valuation; import nu.marginalia.index.client.model.results.SearchResultKeywordScore; import nu.marginalia.language.statistics.TermFrequencyDict; -import nu.marginalia.model.crawl.EdgePageDocumentFlags; -import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.DocumentFlags; +import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordMetadata; @@ -33,29 +33,29 @@ class SearchResultValuatorTest { } List titleOnlyLowCountSet = List.of( new SearchResultKeywordScore(0, "bob", - wordMetadata(32, Set.of(1), EnumSet.of(EdgePageWordFlags.Title)), - docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)), + wordMetadata(32, Set.of(1), EnumSet.of(WordFlags.Title)), + docMetadata(0, 2010, 0, 5, 
EnumSet.noneOf(DocumentFlags.class)), false) ); List highCountNoTitleSet = List.of( new SearchResultKeywordScore(0, "bob", - wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh)), - docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)), + wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)), + docMetadata(0, 2010, 0, 5, EnumSet.noneOf(DocumentFlags.class)), false) ); List highCountSubjectSet = List.of( new SearchResultKeywordScore(0, "bob", - wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh, EdgePageWordFlags.Subjects)), - docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)), + wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)), + docMetadata(0, 2010, 0, 5, EnumSet.noneOf(DocumentFlags.class)), false) ); List first = List.of( new SearchResultKeywordScore(0, "bob", - wordMetadata(202, Set.of(1,3,4,6,7,9,10,11), EnumSet.of(EdgePageWordFlags.TfIdfHigh)), - docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)), + wordMetadata(202, Set.of(1,3,4,6,7,9,10,11), EnumSet.of(WordFlags.TfIdfHigh)), + docMetadata(0, 2010, 0, 5, EnumSet.noneOf(DocumentFlags.class)), false) ); @@ -75,11 +75,11 @@ class SearchResultValuatorTest { System.out.println(highCountSubject); } - private long docMetadata(int topology, int year, int sets, int quality, EnumSet flags) { + private long docMetadata(int topology, int year, int sets, int quality, EnumSet flags) { return new DocumentMetadata(topology, PubDate.toYearByte(year), sets, quality, flags).encode(); } - private long wordMetadata(int tfIdf, Set positions, Set wordFlags) { + private long wordMetadata(int tfIdf, Set positions, Set wordFlags) { int posBits = positions.stream() .mapToInt(i -> (int)((1L << i) & 0xFFFF_FFFFL)) .reduce((a,b) -> a|b) diff --git 
a/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java b/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java index 2bcafe4a..3f9ca32a 100644 --- a/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java +++ b/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java @@ -6,7 +6,7 @@ import lombok.SneakyThrows; import nu.marginalia.browse.DbBrowseDomainsRandom; import nu.marginalia.browse.DbBrowseDomainsSimilarCosine; import nu.marginalia.browse.model.BrowseResult; -import nu.marginalia.model.dbcommon.EdgeDomainBlacklist; +import nu.marginalia.model.dbcommon.DomainBlacklist; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; import nu.marginalia.screenshot.ScreenshotService; @@ -25,7 +25,7 @@ import java.util.Map; import java.util.Optional; public class DatingService extends Service { - private final EdgeDomainBlacklist blacklist; + private final DomainBlacklist blacklist; private final DbBrowseDomainsSimilarCosine browseSimilarCosine; private final DbBrowseDomainsRandom browseRandom; private final MustacheRenderer datingRenderer; @@ -38,7 +38,7 @@ public class DatingService extends Service { RendererFactory rendererFactory, Initialization initialization, MetricsServer metricsServer, - EdgeDomainBlacklist blacklist, + DomainBlacklist blacklist, DbBrowseDomainsSimilarCosine browseSimilarCosine, DbBrowseDomainsRandom browseRandom, ScreenshotService screenshotService) { diff --git a/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java b/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java index 89d0215d..695de6fb 100644 --- a/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java +++ 
b/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java @@ -4,7 +4,7 @@ import nu.marginalia.browse.DbBrowseDomainsRandom; import nu.marginalia.browse.DbBrowseDomainsSimilarCosine; import nu.marginalia.browse.model.BrowseResult; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.dbcommon.EdgeDomainBlacklist; +import nu.marginalia.model.dbcommon.DomainBlacklist; import nu.marginalia.model.id.EdgeId; import java.util.LinkedList; @@ -22,14 +22,14 @@ public class DatingSessionObject { return current; } - public BrowseResult next(DbBrowseDomainsRandom random, EdgeDomainBlacklist blacklist) { + public BrowseResult next(DbBrowseDomainsRandom random, DomainBlacklist blacklist) { if (queue.isEmpty()) { random.getRandomDomains(25, blacklist, 0).forEach(queue::addLast); } return queue.pollFirst(); } - public BrowseResult nextSimilar(EdgeId id, DbBrowseDomainsSimilarCosine adjacent, EdgeDomainBlacklist blacklist) { + public BrowseResult nextSimilar(EdgeId id, DbBrowseDomainsSimilarCosine adjacent, DomainBlacklist blacklist) { adjacent.getDomainNeighborsAdjacentCosine(id, blacklist, 25).forEach(queue::addFirst); while (queue.size() > MAX_QUEUE_SIZE) { diff --git a/other/memex/build.gradle b/other/memex/build.gradle index 1ccf83a0..78b13789 100644 --- a/other/memex/build.gradle +++ b/other/memex/build.gradle @@ -62,7 +62,6 @@ dependencies { implementation project(':third-party') implementation project(':code:common:service') implementation project(':code:common:config') - implementation project(':code:libraries:misc') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') diff --git a/code/libraries/misc/src/main/java/nu/marginalia/util/FileSizeUtil.java b/other/memex/src/main/java/nu/marginalia/util/FileSizeUtil.java similarity index 100% rename from code/libraries/misc/src/main/java/nu/marginalia/util/FileSizeUtil.java rename to 
other/memex/src/main/java/nu/marginalia/util/FileSizeUtil.java diff --git a/settings.gradle b/settings.gradle index 912f8678..b871494c 100644 --- a/settings.gradle +++ b/settings.gradle @@ -14,13 +14,17 @@ include 'code:libraries:easy-lsh' include 'code:libraries:guarded-regex' include 'code:libraries:big-string' include 'code:libraries:random-write-funnel' -include 'code:libraries:misc' +include 'code:libraries:next-prime' +include 'code:libraries:braille-block-punch-cards' include 'code:libraries:language-processing' include 'code:features:screenshots' include 'code:features:random-websites' include 'code:features:domain-ranking' include 'code:features:query-parser' +include 'code:features:adblock' +include 'code:features:pubdate' +include 'code:features:topic-detection' include 'code:api:search-api' include 'code:api:index-api'