From aebb2652e83c6f775044105a7e3933dab31acdd1 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 27 Jul 2024 11:44:13 +0200 Subject: [PATCH] (wip) Extract and encode spans data Refactoring keyword extraction to extract spans information. Modifying the intermediate storage of converted data to use the new slop library, which allows for easier storage of ad-hoc binary data like spans and positions. This is a bit of a Katamari Damacy commit that ended up dragging along a bunch of other fairly tangentially related changes that are hard to break out into separate commits after the fact. Will push as-is to get back to being able to do more isolated work. --- code/common/model/build.gradle | 1 + .../marginalia/model/idx/CodedWordSpan.java | 32 ++ .../nu/marginalia/model/idx/WordFlags.java | 18 +- .../nu/marginalia/model/idx/WordMetadata.java | 89 ---- .../nu/marginalia/model/WordMetadataTest.java | 41 -- code/execution/build.gradle | 6 +- .../actor/task/ConvertAndLoadActor.java | 36 +- .../java/nu/marginalia/svc/BackupService.java | 38 +- .../data-extractors/build.gradle | 2 +- .../nu/marginalia/extractor/AtagExporter.java | 6 +- .../nu/marginalia/extractor/FeedExporter.java | 6 +- .../extractor/TermFrequencyExporter.java | 4 +- .../keyword/DocumentKeywordExtractor.java | 61 ++- .../marginalia/keyword/KeywordMetadata.java | 4 +- .../keyword/model/DocumentKeywords.java | 30 +- .../model/DocumentKeywordsBuilder.java | 80 ++-- .../keyword/DocumentKeywordExtractorTest.java | 37 +- .../api/searchquery/QueryProtobufCodec.java | 9 +- .../results/SearchResultKeywordScore.java | 21 +- .../api/src/main/protobuf/query-api.proto | 3 +- code/index/build.gradle | 4 +- code/index/index-forward/build.gradle | 3 + .../index/forward/ForwardIndexConverter.java | 83 +++- .../index/forward/ForwardIndexFileNames.java | 5 + .../index/forward/ForwardIndexParameters.java | 4 +- .../index/forward/ForwardIndexReader.java | 15 +- .../forward/ForwardIndexSpansReader.java | 63 +++ .../forward/ForwardIndexSpansWriter.java | 53 +++ .../forward/ForwardIndexConverterTest.java | 62 +-- .../forward/ForwardIndexSpansReaderTest.java | 63 +++ .../test/nu/marginalia/test/TestUtil.java | 43 -- code/index/index-journal/build.gradle | 2 + .../index/journal/IndexJournal.java | 53 +++ .../index/journal/IndexJournalFileNames.java | 30 -- .../index/journal/IndexJournalPage.java | 76 +++ .../index/journal/IndexJournalSlopWriter.java | 105 ++++ .../journal/model/IndexJournalEntryData.java | 36 -- .../model/IndexJournalEntryHeader.java | 35 -- .../model/IndexJournalEntryTermData.java | 25 - .../journal/model/IndexJournalFileHeader.java | 10 - .../journal/reader/IndexJournalReadEntry.java | 111 ----- .../journal/reader/IndexJournalReader.java | 73 --- .../reader/IndexJournalReaderPagingImpl.java | 43 -- .../reader/IndexJournalReaderSingleFile.java | 116 ----- .../reader/pointer/IndexJournalPointer.java | 202 -------- .../journal/writer/IndexJournalWriter.java | 17 - .../writer/IndexJournalWriterPagingImpl.java | 68 --- .../IndexJournalWriterSingleFileImpl.java | 155 ------ .../index/journal/IndexJournalWriterTest.java | 448 ------------------ code/index/index-reverse/build.gradle | 3 + .../construction/JournalReaderSource.java | 10 - .../full/FullIndexConstructor.java | 23 +- .../index/construction/full/FullPreindex.java | 10 +- .../full/FullPreindexDocuments.java | 39 +- .../full/FullPreindexReference.java | 2 +- .../full/FullPreindexWordSegments.java | 12 +- .../prio/PrioIndexConstructor.java | 23 +-
.../index/construction/prio/PrioPreindex.java | 13 +- .../prio/PrioPreindexDocuments.java | 34 +- .../prio/PrioPreindexReference.java | 2 +- .../prio/PrioPreindexWordSegments.java | 18 +- .../index/FullReverseIndexReaderTest.java | 25 +- .../full/FullPreindexDocsTest.java | 33 +- .../full/FullPreindexFinalizeTest.java | 18 +- .../full/FullPreindexMergeTest.java | 435 ----------------- .../full/FullPreindexWordSegmentsTest.java | 231 --------- .../construction/full/TestJournalFactory.java | 99 ++-- .../construction/full/TestSegmentData.java | 6 +- .../construction/prio/PrioPreindexTest.java | 14 +- .../test/nu/marginalia/test/TestUtil.java | 43 -- .../nu/marginalia/index/IndexFactory.java | 7 +- .../nu/marginalia/index/IndexGrpcService.java | 12 +- .../marginalia/index/index/StatefulIndex.java | 2 +- .../results/IndexResultScoreCalculator.java | 29 +- .../index/CombinedIndexReaderTest.java | 52 +- ...IndexQueryServiceIntegrationSmokeTest.java | 104 ++-- .../IndexQueryServiceIntegrationTest.java | 73 +-- ...ndexQueryServiceIntegrationTestModule.java | 24 +- .../nu/marginalia/index/util/TestUtil.java | 44 -- code/libraries/array/build.gradle | 2 + .../array/algo/LongArraySortNTest.java | 4 +- .../array/algo/LongArraySortTest.java | 2 +- .../nu/marginalia/util/test/TestUtil.java | 43 -- .../nu/marginalia/sequence/CodedSequence.java | 3 +- .../sequence/GammaCodedSequence.java | 2 +- .../language/sentence/tag/HtmlTag.java | 23 +- code/libraries/slop/build.gradle | 3 + .../dynamic/GammaCodedSequenceColumn.java | 121 +++++ .../dynamic/GammaCodedSequenceReader.java | 34 ++ .../dynamic/GammaCodedSequenceWriter.java | 11 + .../nu/marginalia/slop/desc/ColumnType.java | 1 + .../java}/nu/marginalia/test/TestUtil.java | 4 +- code/process-models/crawl-spec/build.gradle | 32 -- code/process-models/crawl-spec/readme.md | 16 - .../DocumentRecordParquetFileReader.java | 37 -- .../DocumentRecordParquetFileWriter.java | 24 - .../DomainLinkRecordParquetFileReader.java | 30 -- .../DomainLinkRecordParquetFileWriter.java | 24 - .../DomainRecordParquetFileReader.java | 31 -- .../DomainRecordParquetFileWriter.java | 24 - .../io/processed/ProcessedDataFileNames.java | 73 --- .../model/processed/DocumentRecord.java | 185 -------- .../DocumentRecordKeywordsProjection.java | 97 ---- .../DocumentRecordMetadataProjection.java | 100 ---- .../model/processed/DomainLinkRecord.java | 97 ---- .../model/processed/DomainRecord.java | 148 ------ .../model/processed/DomainWithIp.java | 15 - .../DocumentRecordParquetFileReaderTest.java | 107 ----- ...DomainLinkRecordParquetFileReaderTest.java | 49 -- .../DomainRecordParquetFileReaderTest.java | 69 --- code/process-models/work-log/build.gradle | 24 - .../processes/converting-process/build.gradle | 9 +- .../marginalia/converting/ConverterMain.java | 18 +- .../model/DisqualifiedException.java | 2 +- .../converting/processor/AcceptableAds.java | 2 +- .../processor/DocumentProcessor.java | 15 +- .../converting/processor/DomainProcessor.java | 14 +- .../processor/logic/DocumentValuator.java | 4 +- .../AbstractDocumentProcessorPlugin.java | 20 +- .../plugin/HtmlDocumentProcessorPlugin.java | 36 +- .../PlainTextDocumentProcessorPlugin.java | 24 +- .../sideload/SideloaderProcessing.java | 2 +- .../writer/ConverterBatchWriter.java | 89 ++-- .../converting-process/model}/build.gradle | 3 + .../io/processed/ProcessedDataFileNames.java | 16 + .../model/processed/SlopDocumentRecord.java | 395 +++++++++++++++ .../model/processed/SlopDomainLinkRecord.java | 83 ++++ 
.../model/processed/SlopDomainRecord.java | 240 ++++++++++ .../model/processed/SlopPageRef.java | 6 + .../marginalia/worklog/BatchingWorkLog.java | 0 .../worklog/BatchingWorkLogImpl.java | 0 .../worklog/BatchingWorkLogInspector.java | 0 .../converting-process/model}/readme.md | 0 .../worklog/BatchingWorkLogImplTest.java | 0 .../converting/ConvertingIntegrationTest.java | 12 +- ...CrawlingThenConvertingIntegrationTest.java | 10 +- code/processes/crawling-process/build.gradle | 6 +- .../java/nu/marginalia/crawl/CrawlerMain.java | 20 +- .../crawl/retreival/CrawlDataReference.java | 4 +- .../retreival/CrawledDocumentFactory.java | 6 +- .../crawl/retreival/CrawlerRetreiver.java | 11 +- .../retreival/CrawlerWarcResynchronizer.java | 4 +- .../crawl/retreival/DomainProber.java | 2 +- .../retreival/fetcher/ContentTypeProber.java | 2 +- .../crawl/retreival/fetcher/HttpFetcher.java | 2 +- .../retreival/fetcher/HttpFetcherImpl.java | 6 +- .../retreival/fetcher/warc/WarcRecorder.java | 2 +- .../retreival/revisit/CrawlerRevisitor.java | 4 +- .../revisit/DocumentWithReference.java | 8 +- .../crawling-process/model}/build.gradle | 3 + .../crawlspec/CrawlSpecFileNames.java | 0 .../crawlspec/CrawlSpecGenerator.java | 0 .../io/crawldata}/CrawledDomainReader.java | 7 +- .../io/crawldata}/CrawlerOutputFile.java | 2 +- .../SerializableCrawlDataStream.java | 4 +- .../ParquetSerializableCrawlDataStream.java | 10 +- .../CrawlSpecRecordParquetFileReader.java | 0 .../CrawlSpecRecordParquetFileWriter.java | 0 .../model}/body/ContentTypeLogic.java | 2 +- .../model}/body/DocumentBodyExtractor.java | 4 +- .../model}/body/DocumentBodyResult.java | 4 +- .../model}/body/HttpFetchResult.java | 4 +- .../model/crawldata}/CrawledDocument.java | 2 +- .../model/crawldata}/CrawledDomain.java | 2 +- .../crawldata}/CrawlerDocumentStatus.java | 2 +- .../model/crawldata}/CrawlerDomainStatus.java | 2 +- .../crawldata}/SerializableCrawlData.java | 2 +- .../model/crawlspec/CrawlSpecRecord.java | 0 .../CrawledDocumentParquetRecord.java | 4 +- ...rawledDocumentParquetRecordFileReader.java | 2 +- ...rawledDocumentParquetRecordFileWriter.java | 8 +- .../jwarc/WarcXCookieInformationHeader.java | 0 .../netpreserve/jwarc/WarcXEntityRefused.java | 0 .../jwarc/WarcXResponseReference.java | 0 .../crawling-process/model}/readme.md | 0 .../crawling/model/CrawledDocumentTest.java | 4 +- ...edDocumentParquetRecordFileWriterTest.java | 10 +- ...edDocumentParquetRecordFileWriterTest.java | 2 +- .../retreival/fetcher/WarcRecorderTest.java | 9 +- .../revisit/DocumentWithReferenceTest.java | 2 +- .../marginalia/crawling/HttpFetcherTest.java | 6 +- .../retreival/CrawlerMockFetcherTest.java | 13 +- .../retreival/CrawlerRetreiverTest.java | 12 +- .../index-constructor-process/build.gradle | 2 +- .../index/IndexConstructorMain.java | 9 +- code/processes/loading-process/build.gradle | 7 +- .../loading/LoaderIndexJournalWriter.java | 62 ++- .../marginalia/loading/LoaderInputData.java | 28 +- .../documents/DocumentLoaderService.java | 61 ++- .../documents/KeywordLoaderService.java | 66 +-- .../loading/domains/DomainLoaderService.java | 113 +++-- .../links/DomainLinksLoaderService.java | 31 +- .../domains/DomainLoaderServiceTest.java | 102 ---- .../process-mq-api}/build.gradle | 2 + .../marginalia/mqapi/ProcessInboxNames.java | 0 .../mqapi/converting/ConvertAction.java | 0 .../mqapi/converting/ConvertRequest.java | 0 .../mqapi/crawling/CrawlRequest.java | 0 .../mqapi/index/CreateIndexRequest.java | 0 .../nu/marginalia/mqapi/index/IndexName.java | 0 
.../marginalia/mqapi/loading/LoadRequest.java | 0 .../nu/marginalia/api/ApiSearchOperator.java | 14 +- .../search/model/ClusteredUrlDetails.java | 3 - .../control-service/build.gradle | 4 +- .../executor-service/build.gradle | 6 +- code/tools/experiment-runner/build.gradle | 2 +- .../java/nu/marginalia/tools/Experiment.java | 2 +- .../tools/ExperimentRunnerMain.java | 5 +- .../nu/marginalia/tools/LegacyExperiment.java | 6 +- .../tools/experiments/AdblockExperiment.java | 4 +- .../tools/experiments/AtagsExperiment.java | 2 +- .../experiments/DebugConverterExperiment.java | 2 +- .../ExportExternalLinksExperiment.java | 4 +- .../SentenceStatisticsExperiment.java | 3 +- .../experiments/SiteStatisticsExperiment.java | 2 +- .../tools/experiments/TestExperiment.java | 2 +- .../tools/experiments/TopicExperiment.java | 2 +- code/tools/integration-test/build.gradle | 8 +- .../test/nu/marginalia/IntegrationTest.java | 23 +- .../test/IntegrationTestModule.java | 8 +- settings.gradle | 11 +- 221 files changed, 2584 insertions(+), 4613 deletions(-) create mode 100644 code/common/model/java/nu/marginalia/model/idx/CodedWordSpan.java delete mode 100644 code/common/model/java/nu/marginalia/model/idx/WordMetadata.java delete mode 100644 code/common/model/test/nu/marginalia/model/WordMetadataTest.java create mode 100644 code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexSpansReader.java create mode 100644 code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexSpansWriter.java create mode 100644 code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java delete mode 100644 code/index/index-forward/test/nu/marginalia/test/TestUtil.java create mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/IndexJournal.java delete mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalFileNames.java create mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java create mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java delete mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryData.java delete mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java delete mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryTermData.java delete mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalFileHeader.java delete mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java delete mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java delete mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderPagingImpl.java delete mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java delete mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java delete mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java delete mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterPagingImpl.java delete mode 100644 code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java delete mode 100644 
code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalWriterTest.java delete mode 100644 code/index/index-reverse/java/nu/marginalia/index/construction/JournalReaderSource.java delete mode 100644 code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexMergeTest.java delete mode 100644 code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexWordSegmentsTest.java delete mode 100644 code/index/index-reverse/test/nu/marginalia/test/TestUtil.java delete mode 100644 code/index/test/nu/marginalia/index/util/TestUtil.java delete mode 100644 code/libraries/array/test/nu/marginalia/util/test/TestUtil.java create mode 100644 code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceColumn.java create mode 100644 code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceReader.java create mode 100644 code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceWriter.java rename code/{tools/integration-test/test => libraries/test-helpers/java}/nu/marginalia/test/TestUtil.java (94%) delete mode 100644 code/process-models/crawl-spec/build.gradle delete mode 100644 code/process-models/crawl-spec/readme.md delete mode 100644 code/process-models/processed-data/java/nu/marginalia/io/processed/DocumentRecordParquetFileReader.java delete mode 100644 code/process-models/processed-data/java/nu/marginalia/io/processed/DocumentRecordParquetFileWriter.java delete mode 100644 code/process-models/processed-data/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReader.java delete mode 100644 code/process-models/processed-data/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileWriter.java delete mode 100644 code/process-models/processed-data/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java delete mode 100644 code/process-models/processed-data/java/nu/marginalia/io/processed/DomainRecordParquetFileWriter.java delete mode 100644 code/process-models/processed-data/java/nu/marginalia/io/processed/ProcessedDataFileNames.java delete mode 100644 code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecord.java delete mode 100644 code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java delete mode 100644 code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordMetadataProjection.java delete mode 100644 code/process-models/processed-data/java/nu/marginalia/model/processed/DomainLinkRecord.java delete mode 100644 code/process-models/processed-data/java/nu/marginalia/model/processed/DomainRecord.java delete mode 100644 code/process-models/processed-data/java/nu/marginalia/model/processed/DomainWithIp.java delete mode 100644 code/process-models/processed-data/test/nu/marginalia/io/processed/DocumentRecordParquetFileReaderTest.java delete mode 100644 code/process-models/processed-data/test/nu/marginalia/io/processed/DomainLinkRecordParquetFileReaderTest.java delete mode 100644 code/process-models/processed-data/test/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java delete mode 100644 code/process-models/work-log/build.gradle rename code/{process-models/processed-data => processes/converting-process/model}/build.gradle (86%) create mode 100644 code/processes/converting-process/model/java/nu/marginalia/io/processed/ProcessedDataFileNames.java create mode 100644 code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java create 
mode 100644 code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java create mode 100644 code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java create mode 100644 code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopPageRef.java rename code/{process-models/work-log => processes/converting-process/model}/java/nu/marginalia/worklog/BatchingWorkLog.java (100%) rename code/{process-models/work-log => processes/converting-process/model}/java/nu/marginalia/worklog/BatchingWorkLogImpl.java (100%) rename code/{process-models/work-log => processes/converting-process/model}/java/nu/marginalia/worklog/BatchingWorkLogInspector.java (100%) rename code/{process-models/processed-data => processes/converting-process/model}/readme.md (100%) rename code/{process-models/work-log => processes/converting-process/model}/test/nu/marginalia/worklog/BatchingWorkLogImplTest.java (100%) rename code/{process-models/crawling-model => processes/crawling-process/model}/build.gradle (93%) rename code/{process-models/crawl-spec => processes/crawling-process/model}/java/nu/marginalia/crawlspec/CrawlSpecFileNames.java (100%) rename code/{process-models/crawl-spec => processes/crawling-process/model}/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java (100%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling/io => processes/crawling-process/model/java/nu/marginalia/io/crawldata}/CrawledDomainReader.java (86%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling/io => processes/crawling-process/model/java/nu/marginalia/io/crawldata}/CrawlerOutputFile.java (98%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling/io => processes/crawling-process/model/java/nu/marginalia/io/crawldata}/SerializableCrawlDataStream.java (94%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling/io => processes/crawling-process/model/java/nu/marginalia/io/crawldata}/format/ParquetSerializableCrawlDataStream.java (95%) rename code/{process-models/crawl-spec => processes/crawling-process/model}/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java (100%) rename code/{process-models/crawl-spec => processes/crawling-process/model}/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java (100%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling => processes/crawling-process/model/java/nu/marginalia/model}/body/ContentTypeLogic.java (98%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling => processes/crawling-process/model/java/nu/marginalia/model}/body/DocumentBodyExtractor.java (96%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling => processes/crawling-process/model/java/nu/marginalia/model}/body/DocumentBodyResult.java (95%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling => processes/crawling-process/model/java/nu/marginalia/model}/body/HttpFetchResult.java (99%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling/model => processes/crawling-process/model/java/nu/marginalia/model/crawldata}/CrawledDocument.java (98%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling/model => processes/crawling-process/model/java/nu/marginalia/model/crawldata}/CrawledDomain.java (94%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling/model => 
processes/crawling-process/model/java/nu/marginalia/model/crawldata}/CrawlerDocumentStatus.java (80%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling/model => processes/crawling-process/model/java/nu/marginalia/model/crawldata}/CrawlerDomainStatus.java (64%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling/model => processes/crawling-process/model/java/nu/marginalia/model/crawldata}/SerializableCrawlData.java (63%) rename code/{process-models/crawl-spec => processes/crawling-process/model}/java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java (100%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling/parquet => processes/crawling-process/model/java/nu/marginalia/parquet/crawldata}/CrawledDocumentParquetRecord.java (97%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling/parquet => processes/crawling-process/model/java/nu/marginalia/parquet/crawldata}/CrawledDocumentParquetRecordFileReader.java (97%) rename code/{process-models/crawling-model/java/nu/marginalia/crawling/parquet => processes/crawling-process/model/java/nu/marginalia/parquet/crawldata}/CrawledDocumentParquetRecordFileWriter.java (97%) rename code/{process-models/crawling-model => processes/crawling-process/model}/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java (100%) rename code/{process-models/crawling-model => processes/crawling-process/model}/java/org/netpreserve/jwarc/WarcXEntityRefused.java (100%) rename code/{process-models/crawling-model => processes/crawling-process/model}/java/org/netpreserve/jwarc/WarcXResponseReference.java (100%) rename code/{process-models/crawling-model => processes/crawling-process/model}/readme.md (100%) rename code/{process-models/crawling-model => processes/crawling-process/model}/test/nu/marginalia/crawling/model/CrawledDocumentTest.java (94%) rename code/{process-models/crawling-model => processes/crawling-process/model}/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java (90%) delete mode 100644 code/processes/loading-process/test/nu/marginalia/loading/domains/DomainLoaderServiceTest.java rename code/{process-mqapi => processes/process-mq-api}/build.gradle (91%) rename code/{process-mqapi => processes/process-mq-api}/java/nu/marginalia/mqapi/ProcessInboxNames.java (100%) rename code/{process-mqapi => processes/process-mq-api}/java/nu/marginalia/mqapi/converting/ConvertAction.java (100%) rename code/{process-mqapi => processes/process-mq-api}/java/nu/marginalia/mqapi/converting/ConvertRequest.java (100%) rename code/{process-mqapi => processes/process-mq-api}/java/nu/marginalia/mqapi/crawling/CrawlRequest.java (100%) rename code/{process-mqapi => processes/process-mq-api}/java/nu/marginalia/mqapi/index/CreateIndexRequest.java (100%) rename code/{process-mqapi => processes/process-mq-api}/java/nu/marginalia/mqapi/index/IndexName.java (100%) rename code/{process-mqapi => processes/process-mq-api}/java/nu/marginalia/mqapi/loading/LoadRequest.java (100%) diff --git a/code/common/model/build.gradle b/code/common/model/build.gradle index a424efca..3b9d87c3 100644 --- a/code/common/model/build.gradle +++ b/code/common/model/build.gradle @@ -14,6 +14,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':code:libraries:braille-block-punch-cards') + implementation project(':code:libraries:coded-sequence') implementation libs.bundles.slf4j diff --git a/code/common/model/java/nu/marginalia/model/idx/CodedWordSpan.java 
b/code/common/model/java/nu/marginalia/model/idx/CodedWordSpan.java new file mode 100644 index 00000000..70a3e832 --- /dev/null +++ b/code/common/model/java/nu/marginalia/model/idx/CodedWordSpan.java @@ -0,0 +1,32 @@ +package nu.marginalia.model.idx; + +import nu.marginalia.sequence.CodedSequence; + +import java.util.List; + +public record CodedWordSpan(byte code, CodedSequence spans) { + public static SplitSpansList fromSplit(String codes, List<CodedSequence> spans) { + return new SplitSpansList(codes, spans); + } + public static SplitSpansList split(List<CodedWordSpan> spanList) { + return new SplitSpansList( + spanList.stream() + .map(CodedWordSpan::code) + .collect(StringBuilder::new, StringBuilder::append, StringBuilder::append).toString(), + spanList.stream() + .map(CodedWordSpan::spans) + .toList() + ); + } + + public record SplitSpansList(String codes, List<CodedSequence> spans) { + public List<CodedWordSpan> unite() { + if (null == codes) { + return List.of(); + } + else { + return codes.chars().mapToObj(c -> new CodedWordSpan((byte) c, spans.get(codes.indexOf(c)))).toList(); + } + } + } +} diff --git a/code/common/model/java/nu/marginalia/model/idx/WordFlags.java b/code/common/model/java/nu/marginalia/model/idx/WordFlags.java index f9016c48..77baed4c 100644 --- a/code/common/model/java/nu/marginalia/model/idx/WordFlags.java +++ b/code/common/model/java/nu/marginalia/model/idx/WordFlags.java @@ -38,19 +38,27 @@ public enum WordFlags { ExternalLink ; - public int asBit() { - return 1 << ordinal(); + public byte asBit() { + return (byte) (1 << ordinal()); } - public boolean isPresent(long value) { + public boolean isPresent(byte value) { return (asBit() & value) > 0; } - public boolean isAbsent(long value) { + public boolean isAbsent(byte value) { return (asBit() & value) == 0; } - public static EnumSet<WordFlags> decode(long encodedValue) { + public static byte encode(EnumSet<WordFlags> flags) { + byte ret = 0; + for (WordFlags f : flags) { + ret |= f.asBit(); + } + return ret; + } + + public static EnumSet<WordFlags> decode(byte encodedValue) { EnumSet<WordFlags> ret = EnumSet.noneOf(WordFlags.class); for (WordFlags f : values()) { diff --git a/code/common/model/java/nu/marginalia/model/idx/WordMetadata.java b/code/common/model/java/nu/marginalia/model/idx/WordMetadata.java deleted file mode 100644 index 1f1add44..00000000 --- a/code/common/model/java/nu/marginalia/model/idx/WordMetadata.java +++ /dev/null @@ -1,89 +0,0 @@ -package nu.marginalia.model.idx; - - -import nu.marginalia.bbpc.BrailleBlockPunchCards; - -import java.util.EnumSet; -import java.util.Set; - -/** Word level metadata designed to fit in a single 64 bit long.
- * - * @param positions bitmask of term positions within the document - * @param flags word flags (see {@link WordFlags}) - */ -public record WordMetadata(long positions, - int flags) { - - public static final long FLAGS_MASK = (1L << WordFlags.values().length) - 1; - public static final int POSITIONS_COUNT = 64 - WordFlags.values().length; - public static final int POSITIONS_SHIFT = WordFlags.values().length; - public static final long POSITIONS_MASK = ~0L >>> POSITIONS_SHIFT; - - - - public WordMetadata() { - this(emptyValue()); - } - - public WordMetadata(long value) { - this( - ((value >>> POSITIONS_SHIFT) & POSITIONS_MASK), - (int)(value & FLAGS_MASK) - ); - } - - public WordMetadata(long positions, - Set<WordFlags> flags) - { - this(positions, encodeFlags(flags)); - } - - private static int encodeFlags(Set<WordFlags> flags) { - int ret = 0; - for (var flag : flags) { ret |= flag.asBit(); } - return ret; - } - - public static boolean hasFlags(long encoded, long metadataBitMask) { - return (encoded & metadataBitMask) == metadataBitMask; - } - public static boolean hasAnyFlags(long encoded, long metadataBitMask) { - return (encoded & metadataBitMask) != 0; - } - public static long decodePositions(long meta) { - return (meta >>> POSITIONS_SHIFT) & POSITIONS_MASK; - } - - public boolean hasFlag(WordFlags flag) { - return (flags & flag.asBit()) != 0; - } - - public String toString() { - return "[positions=%s; %s]".formatted(BrailleBlockPunchCards.printBits(positions, 56), flagSet()); - } - - /* Encoded in a 64 bit long - */ - public long encode() { - long ret = 0; - - ret |= Integer.toUnsignedLong(flags) & FLAGS_MASK; - ret |= (positions & POSITIONS_MASK) << POSITIONS_SHIFT; - - return ret; - } - - public boolean isEmpty() { - return positions == 0 && flags == 0; - } - - public static long emptyValue() { - return 0L; - } - - - public EnumSet<WordFlags> flagSet() { - return WordFlags.decode(flags); - } - -} diff --git a/code/common/model/test/nu/marginalia/model/WordMetadataTest.java b/code/common/model/test/nu/marginalia/model/WordMetadataTest.java deleted file mode 100644 index 6de3179b..00000000 --- a/code/common/model/test/nu/marginalia/model/WordMetadataTest.java +++ /dev/null @@ -1,41 +0,0 @@ -package nu.marginalia.model; - -import nu.marginalia.bbpc.BrailleBlockPunchCards; -import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; -import org.junit.jupiter.api.Test; - -import java.util.EnumSet; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -class WordMetadataTest { - - @Test - public void codecTest() { - verifyCodec("Vanilla case", new WordMetadata(0x7f0f0000L, EnumSet.allOf(WordFlags.class))); - verifyCodec("Position 32bit", new WordMetadata(0xff0f0000L, EnumSet.allOf(WordFlags.class))); - verifyCodec("Position all", new WordMetadata(0xffff_ff0f_0000L, EnumSet.allOf(WordFlags.class))); - verifyCodec("No flags", new WordMetadata( 0xff0f0000L, EnumSet.noneOf(WordFlags.class))); - verifyCodec("No flags, some bits", new WordMetadata(0x3f_7f7f_7f7f_7f7fL, EnumSet.noneOf(WordFlags.class))); - verifyCodec("No flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.noneOf(WordFlags.class))); - verifyCodec("All flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.allOf(WordFlags.class))); - System.out.println(new WordMetadata(0x7f0f0005L, EnumSet.allOf(WordFlags.class))); - System.out.println(new WordMetadata(0xff0f0013L, EnumSet.noneOf(WordFlags.class))); - System.out.println(new WordMetadata(0xf0f000ff0f0013L, EnumSet.allOf(WordFlags.class))); -
System.out.println(new WordMetadata(0xf0f000ff0f0013L, (byte)-1)); - System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0)); - System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0)); - System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(~0L, (byte) 0).encode(), 64)); - System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(0, (byte) 0xff).encode(), 64)); - System.out.println(BrailleBlockPunchCards.printBits(131973L, 64)); - System.out.println(new WordMetadata(131973L)); - } - - public void verifyCodec(String message, WordMetadata data) { - System.out.println(BrailleBlockPunchCards.printBits(data.encode(), 64)); - assertEquals(data, new WordMetadata(data.encode()), message); - } - - -} \ No newline at end of file diff --git a/code/execution/build.gradle b/code/execution/build.gradle index 973f13c9..354334f3 100644 --- a/code/execution/build.gradle +++ b/code/execution/build.gradle @@ -38,15 +38,15 @@ dependencies { implementation project(':code:functions:search-query') implementation project(':code:execution:api') - implementation project(':code:process-models:crawl-spec') - implementation project(':code:process-models:crawling-model') + implementation project(':code:processes:crawling-process:model') + implementation project(':code:processes:crawling-process:model') implementation project(':code:features-crawl:link-parser') implementation project(':code:features-convert:data-extractors') implementation project(':code:features-convert:stackexchange-xml') implementation project(':code:features-convert:reddit-json') implementation project(':code:index:index-journal') implementation project(':code:index:api') - implementation project(':code:process-mqapi') + implementation project(':code:processes:process-mq-api') implementation project(':third-party:encyclopedia-marginalia-nu') implementation libs.bundles.slf4j diff --git a/code/execution/java/nu/marginalia/actor/task/ConvertAndLoadActor.java b/code/execution/java/nu/marginalia/actor/task/ConvertAndLoadActor.java index 45b7d77a..b508d84e 100644 --- a/code/execution/java/nu/marginalia/actor/task/ConvertAndLoadActor.java +++ b/code/execution/java/nu/marginalia/actor/task/ConvertAndLoadActor.java @@ -6,19 +6,11 @@ import com.google.inject.Singleton; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.With; +import nu.marginalia.IndexLocations; import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorResumeBehavior; import nu.marginalia.actor.state.ActorStep; import nu.marginalia.actor.state.Resume; -import nu.marginalia.nodecfg.NodeConfigurationService; -import nu.marginalia.process.ProcessOutboxes; -import nu.marginalia.process.ProcessService; -import nu.marginalia.service.module.ServiceConfiguration; -import nu.marginalia.storage.model.FileStorageState; -import nu.marginalia.svc.BackupService; -import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageId; -import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.index.api.IndexMqClient; import nu.marginalia.index.api.IndexMqEndpoints; import nu.marginalia.mq.MqMessageState; @@ -27,9 +19,20 @@ import nu.marginalia.mqapi.converting.ConvertRequest; import nu.marginalia.mqapi.index.CreateIndexRequest; import nu.marginalia.mqapi.index.IndexName; import nu.marginalia.mqapi.loading.LoadRequest; +import nu.marginalia.nodecfg.NodeConfigurationService; +import nu.marginalia.process.ProcessOutboxes; +import 
nu.marginalia.process.ProcessService; +import nu.marginalia.service.module.ServiceConfiguration; +import nu.marginalia.storage.FileStorageService; +import nu.marginalia.storage.model.FileStorageId; +import nu.marginalia.storage.model.FileStorageState; +import nu.marginalia.storage.model.FileStorageType; +import nu.marginalia.svc.BackupService; +import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.nio.file.Files; import java.sql.SQLException; import java.util.List; @@ -113,6 +116,21 @@ public class ConvertAndLoadActor extends RecordActorPrototype { yield new Load(List.of(processedId)); } case Load(List<FileStorageId> processedIds, long msgId) when msgId < 0 -> { + // clear the output directory of the loader from any debris from partial jobs that have been aborted + Files.list(IndexLocations.getIndexConstructionArea(storageService)).forEach(path -> { + try { + if (Files.isDirectory(path)) { + FileUtils.deleteDirectory(path.toFile()); + } + else if (Files.isRegularFile(path)) { + Files.delete(path); + } + } catch (Exception e) { + logger.error("Error clearing staging area", e); + } + }); + + long id = mqLoaderOutbox.sendAsync(new LoadRequest(processedIds)); yield new Load(processedIds, id); diff --git a/code/execution/java/nu/marginalia/svc/BackupService.java b/code/execution/java/nu/marginalia/svc/BackupService.java index 23b95f6c..e6c2f0da 100644 --- a/code/execution/java/nu/marginalia/svc/BackupService.java +++ b/code/execution/java/nu/marginalia/svc/BackupService.java @@ -2,22 +2,25 @@ package nu.marginalia.svc; import com.github.luben.zstd.ZstdInputStream; import com.github.luben.zstd.ZstdOutputStream; +import com.google.inject.Inject; import nu.marginalia.IndexLocations; +import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.linkdb.LinkdbFileNames; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageType; -import nu.marginalia.index.journal.IndexJournalFileNames; +import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; -import com.google.inject.Inject; +import java.io.FileNotFoundException; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; import java.time.LocalDateTime; import java.util.List; +import java.util.Optional; public class BackupService { @@ -97,35 +100,20 @@ public class BackupService { private void backupJournal(Path inputStorage, Path backupStorage) throws IOException { - for (var source : IndexJournalFileNames.findJournalFiles(inputStorage)) { - var dest = backupStorage.resolve(source.toFile().getName()); - - try (var is = Files.newInputStream(source); - var os = Files.newOutputStream(dest) - ) { - IOUtils.copyLarge(is, os); - } + Optional<IndexJournal> journal = IndexJournal.findJournal(inputStorage); + if (journal.isEmpty()) { + throw new FileNotFoundException("No journal found in input storage"); } + FileUtils.copyDirectory(journal.get().journalDir().toFile(), backupStorage.resolve(journal.get().journalDir().getFileName()).toFile()); } private void restoreJournal(Path destStorage, Path backupStorage) throws IOException { - - // Remove any old journal files first to avoid them getting loaded - for (var garbage : IndexJournalFileNames.findJournalFiles(destStorage)) { - Files.delete(garbage); + Optional<IndexJournal> journal = IndexJournal.findJournal(backupStorage); + if (journal.isEmpty()) { + throw new
FileNotFoundException("No journal found in backup"); } - - for (var source : IndexJournalFileNames.findJournalFiles(backupStorage)) { - var dest = destStorage.resolve(source.toFile().getName()); - - try (var is = Files.newInputStream(source); - var os = Files.newOutputStream(dest) - ) { - IOUtils.copyLarge(is, os); - } - } - + FileUtils.copyDirectory(backupStorage.resolve(journal.get().journalDir().getFileName()).toFile(), destStorage.toFile()); } private void backupFileCompressed(String fileName, Path inputStorage, Path backupStorage) throws IOException diff --git a/code/features-convert/data-extractors/build.gradle b/code/features-convert/data-extractors/build.gradle index f8841120..82bf536a 100644 --- a/code/features-convert/data-extractors/build.gradle +++ b/code/features-convert/data-extractors/build.gradle @@ -24,7 +24,7 @@ dependencies { implementation project(':code:libraries:blocking-thread-pool') implementation project(':code:features-crawl:link-parser') implementation project(':code:features-convert:anchor-keywords') - implementation project(':code:process-models:crawling-model') + implementation project(':code:processes:crawling-process:model') implementation project(':code:processes:converting-process') implementation project(':third-party:commons-codec') diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java index acc3a417..d2f2c91b 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java @@ -3,13 +3,13 @@ package nu.marginalia.extractor; import com.google.inject.Inject; import gnu.trove.set.hash.TLongHashSet; import lombok.SneakyThrows; -import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.io.crawldata.CrawledDomainReader; +import nu.marginalia.io.crawldata.SerializableCrawlDataStream; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawldata.CrawledDocument; import nu.marginalia.process.log.WorkLog; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorage; diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java index fa925b39..547b810b 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java @@ -2,13 +2,13 @@ package nu.marginalia.extractor; import com.google.inject.Inject; import lombok.SneakyThrows; -import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.io.crawldata.CrawledDomainReader; +import nu.marginalia.io.crawldata.SerializableCrawlDataStream; import nu.marginalia.link_parser.FeedExtractor; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawldata.CrawledDocument; import 
nu.marginalia.process.log.WorkLog; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorage; diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java index 4283a657..2545d666 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java @@ -5,11 +5,11 @@ import gnu.trove.map.hash.TLongIntHashMap; import gnu.trove.set.hash.TLongHashSet; import nu.marginalia.WmsaHome; import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; -import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.io.crawldata.CrawledDomainReader; import nu.marginalia.language.filter.LanguageFilter; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.model.crawldata.CrawledDocument; import nu.marginalia.process.log.WorkLog; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorage; diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java index 4c1f0edd..8e28b550 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -7,14 +7,16 @@ import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordRep; +import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.model.EdgeUrl; import nu.marginalia.term_frequency_dict.TermFrequencyDict; +import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; +import java.util.List; import java.util.stream.Stream; - public class DocumentKeywordExtractor { private final KeywordExtractor keywordExtractor; @@ -93,7 +95,7 @@ public class DocumentKeywordExtractor { var word = rep.word; if (!word.isBlank()) { - long meta = metadata.getMetadataForWord(rep.stemmed); + byte meta = metadata.getMetadataForWord(rep.stemmed); wordsBuilder.addMeta(word, meta); } } @@ -105,7 +107,13 @@ public class DocumentKeywordExtractor { { // we use 1-based indexing since the data // will be gamma encoded, and it can't represent 0 - int pos = 1; + int pos = 0; + + List<SpanRecorder> spanRecorders = List.of( + new SpanRecorder(HtmlTag.TITLE), + new SpanRecorder(HtmlTag.HEADING), + new SpanRecorder(HtmlTag.CODE) + ); for (DocumentSentence sent : dld) { @@ -113,6 +121,12 @@ break; for (var word : sent) { + pos++; + + for (var recorder : spanRecorders) { + recorder.update(sent, pos); + } + if (word.isStopWord()) { continue; } @@ -120,7 +134,7 @@ String w = word.wordLowerCase(); if (matchesWordPattern(w)) { /* Add information about term positions */ - wordsBuilder.addPos(w, pos++); + wordsBuilder.addPos(w, pos); /* Add metadata for word */ wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed())); @@ -130,11 +144,16 @@ public
class DocumentKeywordExtractor { for (var names : keywordExtractor.getProperNames(sent)) { var rep = new WordRep(sent, names); - long meta = metadata.getMetadataForWord(rep.stemmed); + byte meta = metadata.getMetadataForWord(rep.stemmed); wordsBuilder.addMeta(rep.word, meta); } + } + pos++; // we need to add one more position to account for the last word in the document + + for (var recorder : spanRecorders) { + wordsBuilder.addSpans(recorder.finish(pos)); } } @@ -176,4 +195,36 @@ public class DocumentKeywordExtractor { return false; } + + /** Helper class to record spans of words */ + private static class SpanRecorder { + private List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>(); + private final HtmlTag htmlTag; + private int start = 0; + + public SpanRecorder(HtmlTag htmlTag) { + this.htmlTag = htmlTag; + } + + public void update(DocumentSentence sentence, int pos) { + assert pos > 0; + + if (sentence.htmlTags.contains(htmlTag)) { + if (start <= 0) start = pos; + } + else { + if (start > 0) { + spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos)); + start = -1; + } + } + } + + public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) { + if (start > 0) { + spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length)); + } + return spans; + } + } } diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java index 0bf5043a..b27e0676 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java @@ -27,9 +27,9 @@ class KeywordMetadata { this.urlKeywords = urlKeywords; } - public long getMetadataForWord(String stemmed) { + public byte getMetadataForWord(String stemmed) { - long flags = 0; + byte flags = 0; if (subjectLikeKeywords.contains(stemmed)) { flags |= WordFlags.Subjects.asBit(); diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywords.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywords.java index 40a51cd3..d8167422 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywords.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywords.java @@ -1,36 +1,36 @@ package nu.marginalia.keyword.model; +import nu.marginalia.model.idx.CodedWordSpan; import nu.marginalia.sequence.CodedSequence; -import java.io.Serial; -import java.io.Serializable; +import java.util.List; -public final class DocumentKeywords implements Serializable { +public final class DocumentKeywords { - @Serial - private static final long serialVersionUID = 1387282293082091432L; + public final List<String> keywords; + public final byte[] metadata; + public final List<CodedSequence> positions; + public final List<CodedWordSpan> spans; - public final String[] keywords; - public final long[] metadata; - public final CodedSequence[] positions; - - public DocumentKeywords(String[] keywords, - long[] metadata, - CodedSequence[] positions) + public DocumentKeywords(List<String> keywords, + byte[] metadata, + List<CodedSequence> positions, + List<CodedWordSpan> spans) { this.keywords = keywords; this.metadata = metadata; this.positions = positions; + this.spans = spans; - assert keywords.length == metadata.length; + assert keywords.size() == metadata.length; } public boolean isEmpty() { - return keywords.length == 0; + return keywords.isEmpty(); } public int size() { - return
keywords.length; + return keywords.size(); } } diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java index 90870c53..49d090d0 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java @@ -1,11 +1,13 @@ package nu.marginalia.keyword.model; +import gnu.trove.list.array.TByteArrayList; import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntList; -import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap; +import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap; import lombok.Getter; +import nu.marginalia.language.sentence.tag.HtmlTag; +import nu.marginalia.model.idx.CodedWordSpan; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.sequence.CodedSequence; import nu.marginalia.sequence.GammaCodedSequence; import org.slf4j.Logger; @@ -16,8 +18,9 @@ import java.util.*; @Getter public class DocumentKeywordsBuilder { - public final Object2LongLinkedOpenHashMap<String> wordToMeta; + public final Object2ByteOpenHashMap<String> wordToMeta; public final HashMap<String, IntList> wordToPos; + public final Map<Character, List<DocumentWordSpan>> wordSpans = new HashMap<>(); /** These were keywords that had signals of high relevance */ public final Set<String> importantWords = new HashSet<>(); @@ -35,17 +38,17 @@ public class DocumentKeywordsBuilder { } public DocumentKeywords build(ByteBuffer workArea) { - final String[] wordArray = new String[wordToMeta.size()]; - final long[] meta = new long[wordToMeta.size()]; - final CodedSequence[] positions = new CodedSequence[wordToMeta.size()]; + final List<String> wordArray = new ArrayList<>(wordToMeta.size()); + final TByteArrayList meta = new TByteArrayList(wordToMeta.size()); + final List<CodedSequence> positions = new ArrayList<>(wordToMeta.size()); - var iter = wordToMeta.object2LongEntrySet().fastIterator(); + var iter = wordToMeta.object2ByteEntrySet().fastIterator(); - for (int i = 0; iter.hasNext(); i++) { + while (iter.hasNext()) { var entry = iter.next(); - meta[i] = entry.getLongValue(); - wordArray[i] = entry.getKey(); + meta.add(entry.getByteValue()); + wordArray.add(entry.getKey()); var posList = wordToPos.getOrDefault(entry.getKey(), IntList.of()); @@ -53,18 +56,33 @@ public class DocumentKeywordsBuilder { posList.subList(MAX_POSITIONS_PER_WORD, posList.size()).clear(); } - positions[i] = GammaCodedSequence.generate(workArea, posList); + positions.add(GammaCodedSequence.generate(workArea, posList)); } - return new DocumentKeywords(wordArray, meta, positions); + // Encode spans + List<CodedWordSpan> spans = new ArrayList<>(wordSpans.size()); + + wordSpans.forEach((tag, spansForTag) -> { + spansForTag.sort(Comparator.comparingInt(DocumentWordSpan::start)); + + var positionsForTag = new IntArrayList(spansForTag.size()*2); + for (var span : spansForTag) { + positionsForTag.add(span.start()); + positionsForTag.add(span.end()); + } + + spans.add(new CodedWordSpan((byte) tag.charValue(), GammaCodedSequence.generate(workArea, positionsForTag))); + }); + + return new DocumentKeywords(wordArray, meta.toArray(), positions, spans); } public DocumentKeywordsBuilder(int capacity) { - wordToMeta = new Object2LongLinkedOpenHashMap<>(capacity); + wordToMeta = new Object2ByteOpenHashMap<>(capacity); wordToPos = new HashMap<>(capacity); } - public void
addMeta(String word, long meta) { + public void addMeta(String word, byte meta) { if (word.length() > MAX_WORD_LENGTH) return; @@ -84,12 +102,12 @@ public class DocumentKeywordsBuilder { public void setFlagOnMetadataForWords(WordFlags flag, Collection<String> flagWords) { flagWords.forEach(word -> - wordToMeta.mergeLong(word, flag.asBit(), (a, b) -> a|b) + wordToMeta.mergeByte(word, flag.asBit(), (a, b) -> (byte)(a|b)) ); } public void addAllSyntheticTerms(Collection<String> newWords) { - long meta = WordFlags.Synthetic.asBit(); + byte meta = WordFlags.Synthetic.asBit(); // Only add the synthetic flag if the words aren't already present @@ -97,17 +115,17 @@ } public void addAnchorTerms(Map<String, Integer> keywords) { - long flagA = WordFlags.ExternalLink.asBit(); - long flagB = flagA | WordFlags.Site.asBit(); - long flagC = flagB | WordFlags.SiteAdjacent.asBit(); + byte flagA = WordFlags.ExternalLink.asBit(); + byte flagB = (byte) (flagA | WordFlags.Site.asBit()); + byte flagC = (byte) (flagB | WordFlags.SiteAdjacent.asBit()); keywords.forEach((word, count) -> { if (count > 5) { - wordToMeta.mergeLong(word, flagC, (a, b) -> a|b); + wordToMeta.mergeByte(word, flagC, (a, b) -> (byte) (a|b)); } else if (count > 2) { - wordToMeta.mergeLong(word, flagB, (a, b) -> a|b); + wordToMeta.mergeByte(word, flagB, (a, b) -> (byte) (a|b)); } else { - wordToMeta.mergeLong(word, flagA, (a, b) -> a|b); + wordToMeta.mergeByte(word, flagA, (a, b) -> (byte) (a|b)); } }); } @@ -115,9 +133,9 @@ public List<String> getWordsWithAnyFlag(long flags) { List<String> ret = new ArrayList<>(); - for (var iter = wordToMeta.object2LongEntrySet().fastIterator(); iter.hasNext();) { + for (var iter = wordToMeta.object2ByteEntrySet().fastIterator(); iter.hasNext();) { var entry = iter.next(); - if ((flags & entry.getLongValue()) != 0) { + if ((flags & entry.getByteValue()) != 0) { ret.add(entry.getKey()); } } @@ -125,21 +143,27 @@ return ret; } + public void addSpans(List<DocumentWordSpan> newSpans) { + for (var span : newSpans) { + wordSpans.computeIfAbsent(span.tag().code, k -> new ArrayList<>()).add(span); + } + } + public int size() { return Math.max(wordToMeta.size(), wordToPos.size()); } - public WordMetadata getMetaForWord(String word) { - return new WordMetadata(wordToMeta.getLong(word)); - } @Override public String toString() { StringBuilder sb = new StringBuilder("[ "); + wordToMeta.forEach((word, meta) -> { - sb.append(word).append("->").append(new WordMetadata(meta).flagSet()).append(',').append(wordToPos.getOrDefault(word, new IntArrayList())).append(' '); + sb.append(word).append("->").append(WordFlags.decode(meta)).append(',').append(wordToPos.getOrDefault(word, new IntArrayList())).append(' '); }); return sb.append(']').toString(); } + public record DocumentWordSpan(HtmlTag tag, int start, int end) { + } } diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java index 0d731227..71c3befe 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java +++ b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java @@ -4,9 +4,8 @@ import nu.marginalia.WmsaHome; import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.language.sentence.SentenceExtractor; import
nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.model.idx.WordFlags; import nu.marginalia.sequence.CodedSequence; -import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.jsoup.Jsoup; import org.junit.jupiter.api.Assertions; @@ -53,30 +52,11 @@ class DocumentKeywordExtractorTest { keywords.getWordToMeta().forEach((k, v) -> { if (k.contains("_")) { - System.out.println(k + " " + new WordMetadata(v)); + System.out.println(k + " " + WordFlags.decode(v)); } }); } - @Test - public void testKeyboards() throws IOException, URISyntaxException { - var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"), - "Could not load word frequency table"); - String html = new String(resource.readAllBytes(), Charset.defaultCharset()); - var doc = Jsoup.parse(html); - doc.filter(new DomPruningFilter(0.5)); - var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/")); - System.out.println(keywords.getMetaForWord("mechanical")); - System.out.println(keywords.getMetaForWord("keyboard")); - System.out.println(keywords.getMetaForWord("keyboards")); - - System.out.println(new WordMetadata(8894889328781L)); - System.out.println(new WordMetadata(4294967297L)); - System.out.println(new WordMetadata(566820053975498886L)); - // - - System.out.println(new WordMetadata(1198298103937L)); - System.out.println(new WordMetadata(1103808168065L)); - } @Test public void testMadonna() throws IOException, URISyntaxException { @@ -93,16 +73,17 @@ class DocumentKeywordExtractorTest { var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024)); - Map<String, WordMetadata> flags = new HashMap<>(); + Map<String, Byte> flags = new HashMap<>(); Map<String, CodedSequence> positions = new HashMap<>(); for (int i = 0; i < keywordsBuilt.size(); i++) { - String keyword = keywordsBuilt.keywords[i]; - long metadata = keywordsBuilt.metadata[i]; + String keyword = keywordsBuilt.keywords.get(i); + byte metadata = keywordsBuilt.metadata[i] + ; if (Set.of("dirty", "blues").contains(keyword)) { - flags.put(keyword, new WordMetadata(metadata)); - positions.put(keyword, keywordsBuilt.positions[i]); + flags.put(keyword, metadata); + positions.put(keyword, keywordsBuilt.positions.get(i)); } } @@ -127,7 +108,5 @@ new TermFrequencyDict(WmsaHome.getLanguageModels())); SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); - var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online")); - System.out.println(keywords.getMetaForWord("knitting")); } } \ No newline at end of file diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 46681de4..691d374a 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -1,6 +1,9 @@ package nu.marginalia.api.searchquery; import lombok.SneakyThrows; +import nu.marginalia.api.searchquery.model.query.ProcessedQuery; +import nu.marginalia.api.searchquery.model.query.QueryParams; +import nu.marginalia.api.searchquery.model.query.QueryResponse; import
diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 46681de4..691d374a 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -1,6 +1,9 @@ package nu.marginalia.api.searchquery; import lombok.SneakyThrows; +import nu.marginalia.api.searchquery.model.query.ProcessedQuery; +import nu.marginalia.api.searchquery.model.query.QueryParams; +import nu.marginalia.api.searchquery.model.query.QueryResponse; import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; @@ -11,9 +14,6 @@ import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs; import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.api.searchquery.model.query.ProcessedQuery; -import nu.marginalia.api.searchquery.model.query.QueryParams; -import nu.marginalia.api.searchquery.model.query.QueryResponse; import java.util.ArrayList; @@ -197,7 +197,8 @@ public class QueryProtobufCodec { return new SearchResultKeywordScore( keywordScores.getKeyword(), -1, // termId is internal to index service - keywordScores.getEncodedWordMetadata() + (byte) keywordScores.getFlags(), + keywordScores.getPositions() ); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java index 212b2302..b04d65df 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java @@ -1,40 +1,34 @@ package nu.marginalia.api.searchquery.model.results; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; import java.util.Objects; public final class SearchResultKeywordScore { public final long termId; public final String keyword; - private final long encodedWordMetadata; + public byte flags; + public int positionCount; public SearchResultKeywordScore(String keyword, long termId, - long encodedWordMetadata) { + byte flags, + int positionCount) { this.termId = termId; this.keyword = keyword; - this.encodedWordMetadata = encodedWordMetadata; + this.flags = flags; + this.positionCount = positionCount; } public boolean hasTermFlag(WordFlags flag) { - return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit()); + return (flags & flag.asBit()) != 0; } - public long positions() { - return WordMetadata.decodePositions(encodedWordMetadata); - } - public boolean isKeywordSpecial() { return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic); } - public long encodedWordMetadata() { - return encodedWordMetadata; - } - @Override public boolean equals(Object obj) { if (obj == this) return true; @@ -51,8 +45,7 @@ public final class SearchResultKeywordScore { @Override public String toString() { return "SearchResultKeywordScore[" + - "keyword=" + keyword + ", " + - "encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ']'; + "keyword=" + keyword + ']'; } } diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index 642b28ed..ee6e669b 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -108,7 +108,8 @@ message RpcRawResultItem { /* Information about how well a keyword matches a query */ message RpcResultKeywordScore { string keyword = 1; // the keyword - int64 encodedWordMetadata = 2; // bit encoded word metadata + int32 flags = 2; + int32 positions = 3; } /* Query execution parameters */
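
Callers of SearchResultKeywordScore now test flags directly against the byte instead of decoding a packed long. A usage sketch; the flag combination and counts here are illustrative, not taken from the patch:

// Construct a score carrying anchor-text flags and two positions.
byte flags = (byte) (WordFlags.ExternalLink.asBit() | WordFlags.Site.asBit());
var score = new SearchResultKeywordScore("example", -1, flags, 2);

boolean fromAnchor = score.hasTermFlag(WordFlags.ExternalLink); // true
boolean synthetic = score.hasTermFlag(WordFlags.Synthetic);     // false
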
diff --git a/code/index/build.gradle b/code/index/build.gradle index 2f1cde13..db4dab20 100644 --- a/code/index/build.gradle +++ b/code/index/build.gradle @@ -30,8 +30,9 @@ dependencies { implementation project(':code:common:linkdb') implementation project(':code:common:service') - implementation project(':code:functions:search-query:api') + implementation project(':code:processes:converting-process:model') + implementation project(':code:functions:search-query:api') implementation project(':code:index:index-forward') implementation project(':code:index:index-reverse') implementation project(':code:index:query') diff --git a/code/index/index-forward/build.gradle b/code/index/index-forward/build.gradle index 83e0cdc2..3506281f 100644 --- a/code/index/index-forward/build.gradle +++ b/code/index/index-forward/build.gradle @@ -15,11 +15,13 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':code:libraries:array') implementation project(':code:libraries:btree') + implementation project(':code:libraries:slop') implementation project(':code:libraries:coded-sequence') implementation project(':code:index:query') implementation project(':code:index:index-journal') implementation project(':code:common:model') implementation project(':code:common:process') implementation libs.bundles.slf4j @@ -28,6 +30,7 @@ dependencies { implementation libs.fastutil implementation libs.trove + testImplementation project(':code:libraries:test-helpers') testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java index 7c3704ba..2edc283f 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java @@ -1,19 +1,21 @@ package nu.marginalia.index.forward; import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; +import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; import nu.marginalia.index.domainrankings.DomainRankings; -import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.array.LongArray; +import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.slop.column.primitive.LongColumnReader; import org.roaringbitmap.longlong.LongConsumer; import org.roaringbitmap.longlong.Roaring64Bitmap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; @@ -23,22 +25,25 @@ public class ForwardIndexConverter { private final Logger logger = LoggerFactory.getLogger(getClass()); - private final IndexJournalReader journalReader; private final Path outputFileDocsId; private final Path outputFileDocsData; private final DomainRankings domainRankings; + private final Path outputFileSpansData; +
private final IndexJournal journal; public ForwardIndexConverter(ProcessHeartbeat heartbeat, - IndexJournalReader journalReader, Path outputFileDocsId, Path outputFileDocsData, + Path outputFileSpansData, + IndexJournal journal, DomainRankings domainRankings ) { this.heartbeat = heartbeat; - this.journalReader = journalReader; this.outputFileDocsId = outputFileDocsId; this.outputFileDocsData = outputFileDocsData; + this.outputFileSpansData = outputFileSpansData; + this.journal = journal; this.domainRankings = domainRankings; } @@ -58,7 +63,7 @@ public class ForwardIndexConverter { try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) { progress.progress(TaskSteps.GET_DOC_IDS); - LongArray docsFileId = getDocIds(outputFileDocsId, journalReader); + LongArray docsFileId = getDocIds(outputFileDocsId, journal); progress.progress(TaskSteps.GATHER_OFFSETS); @@ -73,20 +78,55 @@ public class ForwardIndexConverter { LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size()); - var pointer = journalReader.newPointer(); - while (pointer.nextDocument()) { - long docId = pointer.documentId(); - int domainId = UrlIdCodec.getDomainId(docId); + ByteBuffer workArea = ByteBuffer.allocate(65536); + for (var instance : journal.pages()) { + try (var docIdReader = instance.openCombinedId(); + var metaReader = instance.openDocumentMeta(); + var featuresReader = instance.openFeatures(); + var sizeReader = instance.openSize(); - long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(docId); + var spansCodesReader = instance.openSpanCodes(); + var spansSeqReader = instance.openSpans(); + var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData) + ) + { + while (docIdReader.hasRemaining()) { + long docId = docIdReader.get(); + int domainId = UrlIdCodec.getDomainId(docId); - int ranking = domainRankings.getRanking(domainId); - long meta = DocumentMetadata.encodeRank(pointer.documentMeta(), ranking); + long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(docId); - long features = pointer.documentFeatures() | ((long) pointer.documentSize() << 32L); + int ranking = domainRankings.getRanking(domainId); + long meta = DocumentMetadata.encodeRank(metaReader.get(), ranking); - docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta); - docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, features); + final int docFeatures = featuresReader.get(); + final int docSize = sizeReader.get(); + + long features = docFeatures | ((long) docSize << 32L); + + // Write spans data + byte[] spansCodes = spansCodesReader.get(); + + spansWriter.beginRecord(spansCodes.length); + + for (int i = 0; i < spansCodes.length; i++) { + workArea.clear(); + spansSeqReader.getData(workArea); + workArea.flip(); + + spansWriter.writeSpan(spansCodes[i], workArea); + } + + long encodedSpansOffset = spansWriter.endRecord(); + + + // Write the principal forward documents file + docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta); + docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, features); + docFileData.set(entryOffset + ForwardIndexParameters.SPANS_OFFSET, encodedSpansOffset); + + } + } } progress.progress(TaskSteps.FORCE); @@ -104,9 +144,16 @@ public class ForwardIndexConverter { } } - private LongArray getDocIds(Path outputFileDocs, IndexJournalReader journalReader) throws IOException { + private 
LongArray getDocIds(Path outputFileDocs, IndexJournal journalReader) throws IOException { Roaring64Bitmap rbm = new Roaring64Bitmap(); - journalReader.forEachDocId(rbm::add); + + for (var instance : journalReader.pages()) { + try (LongColumnReader idReader = instance.openCombinedId()) { + while (idReader.hasRemaining()) { + rbm.add(idReader.get()); + } + } + } LongArray ret = LongArrayFactory.mmapForWritingConfined(outputFileDocs, rbm.getIntCardinality()); rbm.forEach(new LongConsumer() { diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexFileNames.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexFileNames.java index e16e8618..6231256e 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexFileNames.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexFileNames.java @@ -13,6 +13,10 @@ public class ForwardIndexFileNames { case NEXT -> basePath.resolve("fwd-doc-data.dat.next"); case CURRENT -> basePath.resolve("fwd-doc-data.dat"); }; + case SPANS_DATA -> switch (version) { + case NEXT -> basePath.resolve("fwd-spans.dat.next"); + case CURRENT -> basePath.resolve("fwd-spans.dat"); + }; }; } @@ -23,6 +27,7 @@ public class ForwardIndexFileNames { public enum FileIdentifier { DOC_DATA, + SPANS_DATA, DOC_ID } } diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexParameters.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexParameters.java index 0b306050..cef76eb0 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexParameters.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexParameters.java @@ -1,8 +1,8 @@ package nu.marginalia.index.forward; class ForwardIndexParameters { - public static final int ENTRY_SIZE = 2; + public static final int ENTRY_SIZE = 3; public static final int METADATA_OFFSET = 0; public static final int FEATURES_OFFSET = 1; - + public static final int SPANS_OFFSET = 2; } diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java index f9393b45..902c7344 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java @@ -29,19 +29,31 @@ public class ForwardIndexReader { private final TLongIntHashMap idToOffset; private final LongArray data; + private final ForwardIndexSpansReader spansReader; private final Logger logger = LoggerFactory.getLogger(getClass()); - public ForwardIndexReader(Path idsFile, Path dataFile) throws IOException { + public ForwardIndexReader(Path idsFile, + Path dataFile, + Path spansFile) throws IOException { if (!Files.exists(dataFile)) { logger.warn("Failed to create ForwardIndexReader, {} is absent", dataFile); idToOffset = null; data = null; + spansReader = null; return; } else if (!Files.exists(idsFile)) { logger.warn("Failed to create ForwardIndexReader, {} is absent", idsFile); idToOffset = null; data = null; + spansReader = null; + return; + } + else if (!Files.exists(spansFile)) { + logger.warn("Failed to create ForwardIndexReader, {} is absent", spansFile); + idToOffset = null; + data = null; + spansReader = null; return; } @@ -49,6 +61,7 @@ public class ForwardIndexReader { idToOffset = loadIds(idsFile); data = loadData(dataFile); + spansReader = new 
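
With the spans pointer added, each document's slot in the forward index data file grows from two longs to three, as ForwardIndexParameters now encodes. A read-side sketch of the layout, using the docFileData array and docIdToIdx map from the converter above:

// Decode one forward-index entry under the new three-long layout.
long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(docId);

long meta     = docFileData.get(entryOffset + ForwardIndexParameters.METADATA_OFFSET);
long packed   = docFileData.get(entryOffset + ForwardIndexParameters.FEATURES_OFFSET);
long spansPtr = docFileData.get(entryOffset + ForwardIndexParameters.SPANS_OFFSET);

int features = (int) packed;          // low 32 bits, as packed by the converter
int size     = (int) (packed >>> 32); // document size in the high 32 bits
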
ForwardIndexSpansReader(spansFile); } private static TLongIntHashMap loadIds(Path idsFile) throws IOException { diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexSpansReader.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexSpansReader.java new file mode 100644 index 00000000..a670658d --- /dev/null +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexSpansReader.java @@ -0,0 +1,63 @@ +package nu.marginalia.index.forward; + +import it.unimi.dsi.fastutil.ints.IntList; +import nu.marginalia.sequence.GammaCodedSequence; + +import java.io.IOException; +import java.lang.foreign.Arena; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.ArrayList; +import java.util.List; + +@SuppressWarnings("preview") +public class ForwardIndexSpansReader implements AutoCloseable { + private final FileChannel spansFileChannel; + + public ForwardIndexSpansReader(Path spansFile) throws IOException { + this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ); + } + + public List readSpans(Arena arena, long encodedOffset) throws IOException { + long size = encodedOffset & 0xFFF_FFFF; + long offset = encodedOffset >>> 28; + + var buffer = arena.allocate(size).asByteBuffer(); + buffer.clear(); + while (buffer.hasRemaining()) { + spansFileChannel.read(buffer, offset + buffer.position()); + } + buffer.flip(); + + int count = buffer.get(); + + List ret = new ArrayList<>(); + while (count-- > 0) { + byte code = buffer.get(); + short len = buffer.getShort(); + + final int pos = buffer.position(); + + // Decode the gamma-coded sequence; this will advance the buffer position + // in a not entirely predictable way, so we need to save the position + buffer.limit(buffer.position() + len); + var sequence = new GammaCodedSequence(buffer).values(); + ret.add(new SpanData(code, sequence)); + + // Reset the buffer position to the end of the span + buffer.position(pos + len); + buffer.limit(buffer.capacity()); + } + + return ret; + } + + @Override + public void close() throws IOException { + spansFileChannel.close(); + } + + public record SpanData(byte code, IntList data) {} +} diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexSpansWriter.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexSpansWriter.java new file mode 100644 index 00000000..973257c0 --- /dev/null +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexSpansWriter.java @@ -0,0 +1,53 @@ +package nu.marginalia.index.forward; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +public class ForwardIndexSpansWriter implements AutoCloseable { + private final FileChannel outputChannel; + private final ByteBuffer work = ByteBuffer.allocate(32); + + private long stateStartOffset = -1; + private int stateLength = -1; + + public ForwardIndexSpansWriter(Path outputFileSpansData) throws IOException { + this.outputChannel = (FileChannel) Files.newByteChannel(outputFileSpansData, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + } + + public void beginRecord(int count) throws IOException { + stateStartOffset = outputChannel.position(); + stateLength = 0; + + work.clear(); + work.put((byte) count); + 
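
readSpans() above unpacks the long produced by ForwardIndexSpansWriter.endRecord(): the record's byte length sits in the low 28 bits and its file offset in the remaining high bits. A sketch of that packing as a small utility; the class and method names are invented for illustration:

// 28 bits of length caps a spans record at 256 MB; the offset gets 36 bits.
final class SpanPointer {
    private SpanPointer() {}

    static long encode(long offset, int size) { return offset << 28 | size; }
    static long offsetOf(long pointer)        { return pointer >>> 28; }
    static int  sizeOf(long pointer)          { return (int) (pointer & 0xFFF_FFFF); }
}
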
work.flip(); + + while (work.hasRemaining()) + stateLength += outputChannel.write(work); + } + + public void writeSpan(byte spanCode, ByteBuffer sequenceData) throws IOException { + work.clear(); + work.put(spanCode); + work.putShort((short) sequenceData.remaining()); + work.flip(); + + while (work.hasRemaining() || sequenceData.hasRemaining()) { + stateLength += (int) outputChannel.write(new ByteBuffer[]{work, sequenceData}); + } + } + + public long endRecord() { + return stateStartOffset << 28 | stateLength; + + } + + @Override + public void close() throws IOException { + outputChannel.close(); + } +} diff --git a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java index 39b8dec1..0c5255d5 100644 --- a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java +++ b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java @@ -2,15 +2,11 @@ package nu.marginalia.index.forward; import lombok.SneakyThrows; import nu.marginalia.index.domainrankings.DomainRankings; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile; -import nu.marginalia.index.journal.writer.IndexJournalWriter; -import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalSlopWriter; import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.process.control.FakeProcessHeartbeat; -import nu.marginalia.sequence.CodedSequence; -import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -21,85 +17,94 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.stream.IntStream; +import java.util.List; import static org.junit.jupiter.api.Assertions.assertEquals; class ForwardIndexConverterTest { - IndexJournalWriter writer; + IndexJournalSlopWriter writer; - Path indexFile; Path wordsFile1; Path urlsFile1; Path dictionaryFile; + Path workDir; + private final Logger logger = LoggerFactory.getLogger(getClass()); Path dataDir; private Path docsFileId; private Path docsFileData; + private Path docsSpanData; int workSetSize = 512; @BeforeEach @SneakyThrows void setUp() { + + workDir = Files.createTempDirectory(getClass().getSimpleName()); + dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - indexFile = Files.createTempFile("tmp", ".idx"); - indexFile.toFile().deleteOnExit(); - writer = new IndexJournalWriterSingleFileImpl(indexFile); - wordsFile1 = Files.createTempFile("words1", ".idx"); urlsFile1 = Files.createTempFile("urls1", ".idx"); dataDir = Files.createTempDirectory(getClass().getSimpleName()); - for (int i = 1; i < workSetSize; i++) { - createEntry(writer, i); + try (var writer = new IndexJournalSlopWriter(IndexJournal.allocateName(workDir), 0)) { + for (int i = 1; i < workSetSize; i++) { + createEntry(writer, i); + } } - writer.close(); - - docsFileId = dataDir.resolve("docs-i.dat"); docsFileData = dataDir.resolve("docs-d.dat"); + docsSpanData = dataDir.resolve("docs-s.dat"); } @AfterEach 
public void tearDown() { TestUtil.clearTempDir(dataDir); + TestUtil.clearTempDir(workDir); } long createId(long url, long domain) { return UrlIdCodec.encodeId((int) domain, (int) url); } - public void createEntry(IndexJournalWriter writer, int id) { + public void createEntry(IndexJournalSlopWriter writer, int id) { writer.put( - new IndexJournalEntryHeader(createId(id, id/20), + createId(id, id/20), + new SlopDocumentRecord.KeywordsProjection( + "", + -1, id%3, + id%5, 15, - (id % 5)), - new IndexJournalEntryData( - new String[]{}, - new long[]{}, - new CodedSequence[]{} + List.of(), + new byte[0], + List.of(), + new byte[0], + List.of() ) ); + + } @Test void testForwardIndex() throws IOException { new ForwardIndexConverter(new FakeProcessHeartbeat(), - new IndexJournalReaderSingleFile(indexFile), docsFileId, docsFileData, + docsSpanData, + IndexJournal.findJournal(workDir).orElseThrow(), new DomainRankings()).convert(); - var forwardReader = new ForwardIndexReader(docsFileId, docsFileData); + var forwardReader = new ForwardIndexReader(docsFileId, docsFileData, docsSpanData); for (int i = 36; i < workSetSize; i++) { long docId = createId(i, i/20); @@ -108,5 +113,4 @@ class ForwardIndexConverterTest { assertEquals(i/20, UrlIdCodec.getDomainId(docId)); } } - } \ No newline at end of file diff --git a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java new file mode 100644 index 00000000..b77a0f5a --- /dev/null +++ b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java @@ -0,0 +1,63 @@ +package nu.marginalia.index.forward; + +import it.unimi.dsi.fastutil.ints.IntList; +import nu.marginalia.sequence.GammaCodedSequence; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.lang.foreign.Arena; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class ForwardIndexSpansReaderTest { + Path testFile = Files.createTempFile("test", ".idx"); + + ForwardIndexSpansReaderTest() throws IOException { + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(testFile); + } + + @Test + void testSunnyDay() throws IOException { + ByteBuffer wa = ByteBuffer.allocate(32); + + long offset1; + long offset2; + try (var writer = new ForwardIndexSpansWriter(testFile)) { + writer.beginRecord(1); + writer.writeSpan((byte) 'a', GammaCodedSequence.generate(wa, 1, 3, 5).buffer()); + offset1 = writer.endRecord(); + + writer.beginRecord(2); + writer.writeSpan((byte) 'b', GammaCodedSequence.generate(wa, 2, 4, 6).buffer()); + writer.writeSpan((byte) 'c', GammaCodedSequence.generate(wa, 3, 5, 7).buffer()); + offset2 = writer.endRecord(); + } + + try (var reader = new ForwardIndexSpansReader(testFile); + var arena = Arena.ofConfined() + ) { + var spans1 = reader.readSpans(arena, offset1); + var spans2 = reader.readSpans(arena, offset2); + + assertEquals(1, spans1.size()); + + assertEquals('a', spans1.get(0).code()); + assertEquals(IntList.of(1, 3, 5), spans1.get(0).data()); + + assertEquals(2, spans2.size()); + + assertEquals('b', spans2.get(0).code()); + assertEquals(IntList.of(2, 4, 6), spans2.get(0).data()); + assertEquals('c', spans2.get(1).code()); + assertEquals(IntList.of(3, 5, 7), spans2.get(1).data()); + } + } +} \ No newline at end of file diff --git 
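
For reference, the span record format that ForwardIndexSpansWriter emits and ForwardIndexSpansReader consumes, reconstructed from the two implementations above (sizes in bytes):

// count    : 1       -- number of spans in the record
// then, per span:
//   code   : 1       -- HtmlTag code identifying the span type
//   length : 2       -- byte length of the position data
//   data   : length  -- gamma-coded position sequence
//
// endRecord() returns (recordOffset << 28 | recordLength), which the converter
// stores at SPANS_OFFSET and readSpans() decodes again.

The ForwardIndexSpansReaderTest above exercises this round trip end to end.
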
a/code/index/index-forward/test/nu/marginalia/test/TestUtil.java b/code/index/index-forward/test/nu/marginalia/test/TestUtil.java deleted file mode 100644 index 8fbf6b54..00000000 --- a/code/index/index-forward/test/nu/marginalia/test/TestUtil.java +++ /dev/null @@ -1,43 +0,0 @@ -package nu.marginalia.test; - - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; - -public class TestUtil { - public static void clearTempDir(Path dir) { - if (Files.isDirectory(dir)) { - for (File f : dir.toFile().listFiles()) { - File[] files = f.listFiles(); - if (files != null) { - Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); - } - System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")"); - f.delete(); - } - } - System.out.println("Deleting " + dir); - dir.toFile().delete(); - } - - private static String fileSize(Path path) { - try { - long sizeBytes = Files.size(path); - - if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb"; - if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb"; - if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb"; - return sizeBytes + "b"; - } - catch (IOException ex) { - throw new RuntimeException(ex); - } - } - - private static String round(double d) { - return String.format("%.2f", d); - } -} diff --git a/code/index/index-journal/build.gradle b/code/index/index-journal/build.gradle index 7274b8b2..b63f2b23 100644 --- a/code/index/index-journal/build.gradle +++ b/code/index/index-journal/build.gradle @@ -15,7 +15,9 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':code:libraries:coded-sequence') implementation project(':code:libraries:array') + implementation project(':code:libraries:slop') implementation project(':code:common:model') + implementation project(':code:processes:converting-process:model') implementation project(':third-party:parquet-floor') implementation project(':third-party:commons-codec') diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournal.java b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournal.java new file mode 100644 index 00000000..aca9b060 --- /dev/null +++ b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournal.java @@ -0,0 +1,53 @@ +package nu.marginalia.index.journal; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +public record IndexJournal(Path journalDir) { + + public static final String JOURNAL_FILE_NAME = "index-journal"; + + public static Path allocateName(Path base) { + return base.resolve(JOURNAL_FILE_NAME); + } + + /** Returns the journal in the base directory, if one exists. */ + public static Optional<IndexJournal> findJournal(Path baseDirectory) { + Path journal = baseDirectory.resolve(JOURNAL_FILE_NAME); + if (Files.isDirectory(journal)) { + return Optional.of(new IndexJournal(journal)); + } + return Optional.empty(); + } + + /** Returns the number of pages of the journal in the base directory. 
*/ + public static int numPages(Path baseDirectory) { + for (int version = 0; ; version++) { + if (!IndexJournalPage.combinedId.forPage(version).exists(baseDirectory)) { + return version; + } + } + + } + + public IndexJournal { + if (!journalDir.toFile().isDirectory()) { + throw new IllegalArgumentException("Invalid journal directory: " + journalDir); + } + } + + public List pages() { + int pages = numPages(journalDir); + + List instances = new ArrayList<>(pages); + + for (int version = 0; version < pages; version++) { + instances.add(new IndexJournalPage(journalDir, version)); + } + + return instances; + } +} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalFileNames.java b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalFileNames.java deleted file mode 100644 index 8702be34..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalFileNames.java +++ /dev/null @@ -1,30 +0,0 @@ -package nu.marginalia.index.journal; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; - -public class IndexJournalFileNames { - public static Path allocateName(Path base, int idx) { - return base.resolve(String.format("page-index-%04d.dat", idx)); - } - - public static List findJournalFiles(Path baseDirectory) throws IOException { - List ret = new ArrayList<>(); - - try (var listStream = Files.list(baseDirectory)) { - listStream - .filter(IndexJournalFileNames::isJournalFile) - .sorted() - .forEach(ret::add); - } - - return ret; - } - - public static boolean isJournalFile(Path file) { - return file.toFile().getName().matches("page-index-\\d{4}.dat"); - } -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java new file mode 100644 index 00000000..8b8d7c2e --- /dev/null +++ b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java @@ -0,0 +1,76 @@ +package nu.marginalia.index.journal; + +import nu.marginalia.slop.column.array.ByteArrayColumnReader; +import nu.marginalia.slop.column.array.ByteArrayColumnWriter; +import nu.marginalia.slop.column.dynamic.GammaCodedSequenceReader; +import nu.marginalia.slop.column.dynamic.GammaCodedSequenceWriter; +import nu.marginalia.slop.column.dynamic.VarintColumnReader; +import nu.marginalia.slop.column.dynamic.VarintColumnWriter; +import nu.marginalia.slop.column.primitive.*; +import nu.marginalia.slop.desc.ColumnDesc; +import nu.marginalia.slop.desc.ColumnType; +import nu.marginalia.slop.desc.StorageType; + +import java.io.IOException; +import java.nio.file.Path; + +public record IndexJournalPage(Path baseDir, int page) { + public static final ColumnDesc features = new ColumnDesc<>("features", ColumnType.INT_LE, StorageType.PLAIN); + public static final ColumnDesc size = new ColumnDesc<>("size", ColumnType.INT_LE, StorageType.PLAIN); + public static final ColumnDesc combinedId = new ColumnDesc<>("combinedId", ColumnType.LONG_LE, StorageType.PLAIN); + public static final ColumnDesc documentMeta = new ColumnDesc<>("documentMeta", ColumnType.LONG_LE, StorageType.PLAIN); + + public static final ColumnDesc termCounts = new ColumnDesc<>("termCounts", ColumnType.VARINT_LE, StorageType.PLAIN); + public static final ColumnDesc termIds = new ColumnDesc<>("termIds", ColumnType.LONG_LE, StorageType.ZSTD); + public static final ColumnDesc termMeta = new 
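
A consumer-side sketch tying the new journal API together: findJournal() locates the journal directory, pages() enumerates the slop pages, and each page opens typed column readers. The base path is hypothetical; the iteration mirrors what getDocIds() in the converter above does:

Path baseDir = Path.of("index-data"); // hypothetical location
IndexJournal journal = IndexJournal.findJournal(baseDir).orElseThrow();

for (IndexJournalPage page : journal.pages()) {
    try (LongColumnReader ids = page.openCombinedId()) {
        while (ids.hasRemaining()) {
            long combinedId = ids.get();
            // ... per-document processing ...
        }
    }
}
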
ColumnDesc<>("termMetadata", ColumnType.BYTE, StorageType.ZSTD); + public static final ColumnDesc positions = new ColumnDesc<>("termPositions", ColumnType.BYTE_ARRAY_GCS, StorageType.ZSTD); + + public static final ColumnDesc spanCodes = new ColumnDesc<>("spanCodes", ColumnType.BYTE_ARRAY, StorageType.ZSTD); + public static final ColumnDesc spans = new ColumnDesc<>("spans", ColumnType.BYTE_ARRAY_GCS, StorageType.ZSTD); + + public IndexJournalPage { + if (!baseDir.toFile().isDirectory()) { + throw new IllegalArgumentException("Invalid base directory: " + baseDir); + } + } + + public LongColumnReader openCombinedId() throws IOException { + return combinedId.forPage(page).open(baseDir); + } + + public LongColumnReader openDocumentMeta() throws IOException { + return documentMeta.forPage(page).open(baseDir); + } + + public IntColumnReader openFeatures() throws IOException { + return features.forPage(page).open(baseDir); + } + + public IntColumnReader openSize() throws IOException { + return size.forPage(page).open(baseDir); + } + + public LongColumnReader openTermCounts() throws IOException { + return termCounts.forPage(page).open(baseDir); + } + + public LongColumnReader openTermIds() throws IOException { + return termIds.forPage(page).open(baseDir); + } + + public ByteColumnReader openTermMetadata() throws IOException { + return termMeta.forPage(page).open(baseDir); + } + + public GammaCodedSequenceReader openTermPositions() throws IOException { + return positions.forPage(page).open(baseDir); + } + + public GammaCodedSequenceReader openSpans() throws IOException { + return spans.forPage(page).open(baseDir); + } + + public ByteArrayColumnReader openSpanCodes() throws IOException { + return spanCodes.forPage(page).open(baseDir); + } +} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java new file mode 100644 index 00000000..10e4edd6 --- /dev/null +++ b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java @@ -0,0 +1,105 @@ +package nu.marginalia.index.journal; + +import lombok.SneakyThrows; +import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.model.processed.SlopDocumentRecord; +import nu.marginalia.sequence.CodedSequence; +import nu.marginalia.sequence.GammaCodedSequence; +import nu.marginalia.slop.column.array.ByteArrayColumnWriter; +import nu.marginalia.slop.column.dynamic.GammaCodedSequenceWriter; +import nu.marginalia.slop.column.primitive.ByteColumnWriter; +import nu.marginalia.slop.column.primitive.IntColumnWriter; +import nu.marginalia.slop.column.primitive.LongColumnWriter; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +public class IndexJournalSlopWriter implements AutoCloseable { + + private final IntColumnWriter featuresWriter; + private final IntColumnWriter sizeWriter; + private final LongColumnWriter combinedIdWriter; + private final LongColumnWriter documentMetaWriter; + + private final LongColumnWriter termCountsWriter; + private final LongColumnWriter termIdsWriter; + private final ByteColumnWriter termMetadataWriter; + private final GammaCodedSequenceWriter termPositionsWriter; + + private final GammaCodedSequenceWriter spansWriter; + private final ByteArrayColumnWriter spanCodesWriter; + + private static final MurmurHash3_128 hash = new MurmurHash3_128(); + + public IndexJournalSlopWriter(Path dir, int page) throws IOException { + 
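
The term columns of a page are parallel arrays gated by termCounts: document n owns the next termCounts[n] rows of termIds, termMetadata and termPositions. A reader sketch under that assumption, reusing the accessor signatures seen elsewhere in this patch (ByteColumnReader.get() returning a single byte is assumed):

ByteBuffer workArea = ByteBuffer.allocate(65536);

try (var counts = page.openTermCounts();
     var ids = page.openTermIds();
     var metas = page.openTermMetadata();
     var positions = page.openTermPositions()) {
    long n = counts.get(); // number of terms in the next document
    for (long i = 0; i < n; i++) {
        long termId = ids.get();
        byte meta = metas.get();

        workArea.clear();
        positions.getData(workArea); // gamma-coded positions for this term
        workArea.flip();
    }
}
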
if (!Files.exists(dir)) { + Files.createDirectory(dir); + } + + + featuresWriter = IndexJournalPage.features.forPage(page).create(dir); + sizeWriter = IndexJournalPage.size.forPage(page).create(dir); + + combinedIdWriter = IndexJournalPage.combinedId.forPage(page).create(dir); + documentMetaWriter = IndexJournalPage.documentMeta.forPage(page).create(dir); + + termCountsWriter = IndexJournalPage.termCounts.forPage(page).create(dir); + termIdsWriter = IndexJournalPage.termIds.forPage(page).create(dir); + termMetadataWriter = IndexJournalPage.termMeta.forPage(page).create(dir); + termPositionsWriter = IndexJournalPage.positions.forPage(page).create(dir); + + spansWriter = IndexJournalPage.spans.forPage(page).create(dir); + spanCodesWriter = IndexJournalPage.spanCodes.forPage(page).create(dir); + } + + @SneakyThrows + public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection) { + + combinedIdWriter.put(combinedId); + featuresWriter.put(keywordsProjection.htmlFeatures()); + sizeWriter.put(keywordsProjection.length()); + documentMetaWriter.put(keywordsProjection.documentMetadata()); + + // -- write keyword data -- + + final List keywords = keywordsProjection.words(); + byte[] termMetadata = keywordsProjection.metas(); + + termCountsWriter.put(keywords.size()); + + // termIds are the special hashes of the keywords + long[] termIds = new long[keywordsProjection.words().size()]; + for (int i = 0; i < termIds.length; i++) { + termIds[i] = hash.hashKeyword(keywords.get(i)); + } + + List termPositions = keywordsProjection.positions(); + for (int i = 0; i < termMetadata.length; i++) { + termMetadataWriter.put(termMetadata[i]); + termIdsWriter.put(termIds[i]); + termPositionsWriter.put((GammaCodedSequence) termPositions.get(i)); + } + + // -- write spans -- + + spanCodesWriter.put(keywordsProjection.spanCodes()); + for (var span : keywordsProjection.spans()) { + spansWriter.put((GammaCodedSequence) span); + } + } + + public void close() throws IOException { + featuresWriter.close(); + sizeWriter.close(); + combinedIdWriter.close(); + documentMetaWriter.close(); + termCountsWriter.close(); + termIdsWriter.close(); + termMetadataWriter.close(); + termPositionsWriter.close(); + spansWriter.close(); + spanCodesWriter.close(); + } +} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryData.java b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryData.java deleted file mode 100644 index 6fc5e8cf..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryData.java +++ /dev/null @@ -1,36 +0,0 @@ -package nu.marginalia.index.journal.model; - -import nu.marginalia.hash.MurmurHash3_128; -import nu.marginalia.sequence.CodedSequence; - -public record IndexJournalEntryData(long[] termIds, - long[] metadata, - CodedSequence[] positions) { - - public IndexJournalEntryData { - assert termIds.length == metadata.length; - assert termIds.length == positions.length; - } - - public IndexJournalEntryData(String[] keywords, - long[] metadata, - CodedSequence[] positions) - { - this(termIds(keywords), metadata, positions); - } - - private static final MurmurHash3_128 hash = new MurmurHash3_128(); - - public int size() { - return termIds.length; - } - - - private static long[] termIds(String[] keywords) { - long[] termIds = new long[keywords.length]; - for (int i = 0; i < keywords.length; i++) { - termIds[i] = hash.hashKeyword(keywords[i]); - } - return termIds; - } -} diff --git 
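
Writing is page-oriented: one IndexJournalSlopWriter owns one page of column files, and numPages() rediscovers how many pages already exist. A sketch of appending a fresh page, consistent with how the converter test above allocates page 0; combinedId and keywordsProjection stand in for real data:

Path journalDir = IndexJournal.allocateName(workDir);
int nextPage = IndexJournal.numPages(journalDir); // 0 for a fresh journal

try (var writer = new IndexJournalSlopWriter(journalDir, nextPage)) {
    writer.put(combinedId, keywordsProjection); // one call per document
}
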
a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java deleted file mode 100644 index 82dc904a..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java +++ /dev/null @@ -1,35 +0,0 @@ -package nu.marginalia.index.journal.model; - -import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.model.id.UrlIdCodec; -import nu.marginalia.model.idx.DocumentMetadata; - -/** The header of an index journal entry. - * - * @param entrySize the size of the entry - * @param documentFeatures the features of the document, as an encoded HtmlFeature - * @param combinedId the combined document id, encoded with UrlIdCodec - * @param documentMeta the metadata of the document, as an encoded DocumentMetadata - * - * @see DocumentMetadata - * @see HtmlFeature - * @see UrlIdCodec - */ -public record IndexJournalEntryHeader(int entrySize, - int documentFeatures, - int documentSize, - long combinedId, - long documentMeta) { - - public IndexJournalEntryHeader(long combinedId, - int documentFeatures, - int documentSize, - long documentMeta) { - this(-1, - documentFeatures, - documentSize, - combinedId, - documentMeta); - } - -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryTermData.java b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryTermData.java deleted file mode 100644 index 3fec11a0..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryTermData.java +++ /dev/null @@ -1,25 +0,0 @@ -package nu.marginalia.index.journal.model; - -import nu.marginalia.sequence.CodedSequence; -import nu.marginalia.sequence.GammaCodedSequence; - -import java.nio.ByteBuffer; - -/** Data corresponding to a term in a document in the index journal. - * - * @param termId the id of the term - * @param metadata the metadata of the term - * @param positionsBuffer buffer holding positions of the word in the document, gamma coded - * - * @see GammaCodedSequence - */ -public record IndexJournalEntryTermData( - long termId, - long metadata, - ByteBuffer positionsBuffer) -{ - public CodedSequence positions() { - return new GammaCodedSequence(positionsBuffer); - } - -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalFileHeader.java b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalFileHeader.java deleted file mode 100644 index 7a4ca7e0..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalFileHeader.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.index.journal.model; - -/** The header of an index journal file. This is the first 16 bytes of the file, - * and is not compressed. 
- * - * @param fileSizeRecords the size of the file in number of records - * @param reserved should be 0 - */ -public record IndexJournalFileHeader(long fileSizeRecords, long reserved) { -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java deleted file mode 100644 index e5756bf4..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java +++ /dev/null @@ -1,111 +0,0 @@ -package nu.marginalia.index.journal.reader; - -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.model.IndexJournalEntryTermData; -import nu.marginalia.model.id.UrlIdCodec; - -import java.io.DataInputStream; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.Iterator; - -public class IndexJournalReadEntry implements Iterable { - public final IndexJournalEntryHeader header; - - private final ByteBuffer buffer; - - private final int initialPos; - - public IndexJournalReadEntry(IndexJournalEntryHeader header, ByteBuffer buffer) { - this.header = header; - this.buffer = buffer; - this.initialPos = buffer.position(); - } - - public static IndexJournalReadEntry read(DataInputStream inputStream) throws IOException { - - final int entrySize = (inputStream.readShort() & 0xFFFF); - final int docSize = inputStream.readShort(); - final int docFeatures = inputStream.readInt(); - final long docId = inputStream.readLong(); - final long meta = inputStream.readLong(); - - var header = new IndexJournalEntryHeader( - entrySize, - docFeatures, - docSize, - docId, - meta); - - byte[] buffer = new byte[entrySize]; - inputStream.readFully(buffer); - return new IndexJournalReadEntry(header, ByteBuffer.wrap(buffer)); - } - - public long docId() { - return header.combinedId(); - } - - public long docMeta() { - return header.documentMeta(); - } - - public int documentFeatures() { - return header.documentFeatures(); - } - - public int documentSize() { - return header.documentSize(); - } - - public int domainId() { - return UrlIdCodec.getDomainId(docId()); - } - - public void reset() { - buffer.position(initialPos); - } - - public Iterator iterator() { - return new TermDataIterator(buffer, initialPos); - } - -} - -class TermDataIterator implements Iterator { - private final ByteBuffer buffer; - - // Pointer alias to buffer, used to reduce slice() allocation overhead in the iterator - private final ByteBuffer alias; - - TermDataIterator(ByteBuffer buffer, int initialPos) { - this.buffer = buffer; - this.buffer.position(initialPos); - this.alias = buffer.duplicate(); - } - - @Override - public boolean hasNext() { - return buffer.position() < buffer.limit(); - } - - @Override - public IndexJournalEntryTermData next() { - // read the metadata for the term - long termId = buffer.getLong(); - long meta = buffer.getShort(); - - // read the size of the sequence data - int size = buffer.getShort() & 0xFFFF; - - // position the alias buffer to the term data - alias.limit(buffer.position() + size); - alias.position(buffer.position()); - - // advance the buffer position to the next term - buffer.position(buffer.position() + size); - - return new IndexJournalEntryTermData(termId, meta, alias); - } - -} \ No newline at end of file diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java 
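
For reference, the per-document record format of the binary journal being deleted, reconstructed from IndexJournalReadEntry.read() above and the header-size constants that follow (sizes in bytes; the payload is ZSTD-compressed behind the 16-byte file header):

// entrySize    : 2  -- byte length of the term data following this header
// docSize      : 2
// features     : 4
// combinedId   : 8
// documentMeta : 8  -- 24 bytes in all, per DOCUMENT_HEADER_SIZE_BYTES
// then, per term, until entrySize bytes are consumed:
//   termId     : 8
//   meta       : 2
//   size       : 2  -- 12 bytes, per TERM_HEADER_SIZE_BYTES
//   data       : size  -- gamma-coded positions
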
b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java deleted file mode 100644 index a0cbe2e0..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java +++ /dev/null @@ -1,73 +0,0 @@ -package nu.marginalia.index.journal.reader; - -import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.function.LongConsumer; -import java.util.function.LongPredicate; - -/** Tools for reading the index journal. */ -public interface IndexJournalReader { - int FILE_HEADER_SIZE_LONGS = 2; - int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS; - - int DOCUMENT_HEADER_SIZE_BYTES = 24; - int TERM_HEADER_SIZE_BYTES = 12; - - /** Create a reader for a single file. */ - static IndexJournalReader singleFile(Path fileName) throws IOException { - return new IndexJournalReaderSingleFile(fileName); - } - - /** Create a reader for a set of files. */ - static IndexJournalReader paging(Path baseDir) throws IOException { - return new IndexJournalReaderPagingImpl(baseDir); - } - - default void forEachWordId(LongConsumer consumer) { - var ptr = this.newPointer(); - while (ptr.nextDocument()) { - for (var termData : ptr) { - consumer.accept(termData.termId()); - } - } - } - - default void forEachDocId(LongConsumer consumer) throws IOException { - try (var ptr = this.newPointer()) { - while (ptr.nextDocument()) { - consumer.accept(ptr.documentId()); - } - } - } - - /** Create a new pointer to the journal. The IndexJournalPointer is - * a two-tiered iterator that allows both iteration over document records - * and the terms within each document. - */ - IndexJournalPointer newPointer(); - - /** Reader that filters the entries based on the term metadata. 
*/ - default IndexJournalReader filtering(LongPredicate termMetaFilter) { - return new FilteringIndexJournalReader(this, termMetaFilter); - } - -} - -class FilteringIndexJournalReader implements IndexJournalReader { - private final IndexJournalReader base; - private final LongPredicate termMetaFilter; - - FilteringIndexJournalReader(IndexJournalReader base, LongPredicate termMetaFilter) { - this.base = base; - this.termMetaFilter = termMetaFilter; - } - - @Override - public IndexJournalPointer newPointer() { - return base - .newPointer() - .filterWordMeta(termMetaFilter); - } -} \ No newline at end of file diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderPagingImpl.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderPagingImpl.java deleted file mode 100644 index 8a4361fa..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderPagingImpl.java +++ /dev/null @@ -1,43 +0,0 @@ -package nu.marginalia.index.journal.reader; - -import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer; -import nu.marginalia.index.journal.IndexJournalFileNames; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; - -public class IndexJournalReaderPagingImpl implements IndexJournalReader { - - private static final Logger logger = LoggerFactory.getLogger(IndexJournalReaderPagingImpl.class); - private final List readers; - - public IndexJournalReaderPagingImpl(Path baseDir) throws IOException { - this(IndexJournalFileNames.findJournalFiles(baseDir)); - - if (readers.isEmpty()) - logger.warn("Creating paging index journal file in {}, found no inputs!", baseDir); - else - logger.info("Creating paging index journal reader for {} inputs", readers.size()); - } - - public IndexJournalReaderPagingImpl(List inputFiles) throws IOException { - this.readers = new ArrayList<>(inputFiles.size()); - - for (var inputFile : inputFiles) { - readers.add(new IndexJournalReaderSingleFile(inputFile)); - } - } - - @Override - public IndexJournalPointer newPointer() { - return IndexJournalPointer.concatenate( - readers.stream() - .map(IndexJournalReader::newPointer) - .toArray(IndexJournalPointer[]::new) - ); - } -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java deleted file mode 100644 index 4598a538..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java +++ /dev/null @@ -1,116 +0,0 @@ -package nu.marginalia.index.journal.reader; - -import com.github.luben.zstd.ZstdInputStream; -import lombok.SneakyThrows; -import nu.marginalia.index.journal.model.IndexJournalEntryTermData; -import nu.marginalia.index.journal.model.IndexJournalFileHeader; -import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer; -import org.jetbrains.annotations.NotNull; - -import java.io.*; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardOpenOption; -import java.util.Iterator; - -public class IndexJournalReaderSingleFile implements IndexJournalReader { - - private final Path journalFile; - public final IndexJournalFileHeader fileHeader; - - @Override - public String toString() { - return "IndexJournalReaderSingleCompressedFile{" + journalFile + " 
}"; - } - - public IndexJournalReaderSingleFile(Path file) throws IOException { - this.journalFile = file; - - fileHeader = readHeader(file); - } - - private static IndexJournalFileHeader readHeader(Path file) throws IOException { - try (var raf = new RandomAccessFile(file.toFile(), "r")) { - long recordCount = raf.readLong(); - long unused = raf.readLong(); - - return new IndexJournalFileHeader(recordCount, unused); - } - } - - private static DataInputStream createInputStream(Path file) throws IOException { - var fileInputStream = Files.newInputStream(file, StandardOpenOption.READ); - - // skip the header - fileInputStream.skipNBytes(16); - - return new DataInputStream(new ZstdInputStream(new BufferedInputStream(fileInputStream))); - } - - @SneakyThrows - @Override - public IndexJournalPointer newPointer() { - return new SingleFileJournalPointer(fileHeader, createInputStream(journalFile)); - } - -} - -class SingleFileJournalPointer implements IndexJournalPointer { - - private final IndexJournalFileHeader fileHeader; - private final DataInputStream dataInputStream; - private IndexJournalReadEntry entry; - private int docIdx = -1; - - public SingleFileJournalPointer( - IndexJournalFileHeader fileHeader, - DataInputStream dataInputStream) - { - this.fileHeader = fileHeader; - this.dataInputStream = dataInputStream; - } - - @SneakyThrows - @Override - public boolean nextDocument() { - if (++docIdx < fileHeader.fileSizeRecords()) { - entry = IndexJournalReadEntry.read(dataInputStream); - return true; - } - - dataInputStream.close(); - - return false; - } - - @Override - public long documentId() { - return entry.docId(); - } - - @Override - public long documentMeta() { - return entry.docMeta(); - } - - - @Override - public int documentFeatures() { return entry.documentFeatures(); } - - @Override - public int documentSize() { return entry.documentSize(); } - - /** Return an iterator over the terms in the current document. - * This iterator is not valid after calling nextDocument(). - */ - @NotNull - @Override - public Iterator iterator() { - return entry.iterator(); - } - - @Override - public void close() throws IOException { - dataInputStream.close(); - } -} \ No newline at end of file diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java deleted file mode 100644 index 68d21360..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java +++ /dev/null @@ -1,202 +0,0 @@ -package nu.marginalia.index.journal.reader.pointer; - -import nu.marginalia.index.journal.model.IndexJournalEntryTermData; -import org.jetbrains.annotations.NotNull; - -import java.io.IOException; -import java.util.Iterator; -import java.util.function.LongPredicate; - -/** - * This is something like a double iterator. The Index Journal consists of - * blocks of words and word-metadata for each document and document metadata. - *
- * - * Perhaps best conceptualized as something like - * - *
[doc1: word1 word2 word3 word4] [doc2: word1 word2 word3 ]
- * nextDocument() will move the pointer from doc1 to doc2;
- * nextRecord() will move the pointer from word1 to word2...
- */ -public interface IndexJournalPointer extends Iterable, AutoCloseable { - /** - * Advance to the next document in the journal, - * returning true if such a document exists. - * Resets the record index to before the first - * record (if it exists). - */ - boolean nextDocument(); - - /** - * Get the id associated with the current document - */ - long documentId(); - - /** - * Get the metadata associated with the current document - */ - long documentMeta(); - - /** - * Get the documentFeatures associated with the current record - */ - int documentFeatures(); - - int documentSize(); - - /** Concatenate a number of journal pointers */ - static IndexJournalPointer concatenate(IndexJournalPointer... pointers) { - if (pointers.length == 1) - return pointers[0]; - - return new JoiningJournalPointer(pointers); - } - - /** Add a filter on word metadata to the pointer */ - default IndexJournalPointer filterWordMeta(LongPredicate filter) { - return new FilteringJournalPointer(this, filter); - } - - void close() throws IOException; -} - -class JoiningJournalPointer implements IndexJournalPointer { - private final IndexJournalPointer[] pointers; - private int pIndex = 0; - - JoiningJournalPointer(IndexJournalPointer[] pointers) { - this.pointers = pointers; - } - - @Override - public boolean nextDocument() { - - while (pIndex < pointers.length) { - if (pointers[pIndex].nextDocument()) - return true; - else pIndex++; - } - - return false; - } - - @Override - public long documentId() { - return pointers[pIndex].documentId(); - } - - @Override - public long documentMeta() { - return pointers[pIndex].documentMeta(); - } - - - @Override - public int documentFeatures() { - return pointers[pIndex].documentFeatures(); - } - - @Override - public int documentSize() { - return pointers[pIndex].documentSize(); - } - - @NotNull - @Override - public Iterator iterator() { - return pointers[pIndex].iterator(); - } - - public void close() { - for (var p : pointers) { - try { - p.close(); - } catch (Exception e) { - e.printStackTrace(); - } - } - - } -} - -class FilteringJournalPointer implements IndexJournalPointer { - private final IndexJournalPointer base; - private final LongPredicate filter; - - FilteringJournalPointer(IndexJournalPointer base, LongPredicate filter) { - this.base = base; - this.filter = filter; - } - - @Override - public boolean nextDocument() { - while (base.nextDocument()) { - if (iterator().hasNext()) { - return true; - } - } - return false; - } - - @Override - public long documentId() { - return base.documentId(); - } - - @Override - public long documentMeta() { - return base.documentMeta(); - } - - @Override - public int documentFeatures() { - return base.documentFeatures(); - } - - - @Override - public int documentSize() { - return base.documentSize(); - } - - @NotNull - @Override - public Iterator iterator() { - - return new Iterator<>() { - private final Iterator baseIter = base.iterator(); - private IndexJournalEntryTermData value = null; - - @Override - public boolean hasNext() { - if (value != null) { - return true; - } - while (baseIter.hasNext()) { - value = baseIter.next(); - if (filter.test(value.metadata())) { - return true; - } - } - value = null; - return false; - } - - @Override - public IndexJournalEntryTermData next() { - if (hasNext()) { - var ret = value; - value = null; - return ret; - } else { - throw new IllegalStateException("No more elements"); - } - } - }; - } - - @Override - public void close() throws IOException { - base.close(); - } -} \ No newline at end of file 
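
The pointer interface deleted above modeled the journal as a two-level cursor over documents and their terms; the slop replacement expresses the same traversal with parallel column readers (see IndexJournalPage above). The retired idiom, for contrast:

try (IndexJournalPointer ptr = reader.newPointer()) {
    while (ptr.nextDocument()) {
        long docId = ptr.documentId();
        for (IndexJournalEntryTermData term : ptr) {
            long termId = term.termId();
            // ... per-term processing ...
        }
    }
}
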
diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java
deleted file mode 100644
index 916cf7a6..00000000
--- a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java
+++ /dev/null
@@ -1,17 +0,0 @@
-package nu.marginalia.index.journal.writer;
-
-import nu.marginalia.index.journal.model.IndexJournalEntryData;
-import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
-
-import java.io.IOException;
-
-/** Responsible for writing to the index journal.
- *
- * @see IndexJournalWriterSingleFileImpl
- * @see IndexJournalWriterPagingImpl
- */
-public interface IndexJournalWriter extends AutoCloseable {
-    void close() throws IOException;
-
-    int put(IndexJournalEntryHeader header, IndexJournalEntryData data);
-}
diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterPagingImpl.java b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterPagingImpl.java
deleted file mode 100644
index 919a8326..00000000
--- a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterPagingImpl.java
+++ /dev/null
@@ -1,68 +0,0 @@
-package nu.marginalia.index.journal.writer;
-
-import lombok.SneakyThrows;
-import nu.marginalia.index.journal.model.IndexJournalEntryData;
-import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
-import nu.marginalia.index.journal.IndexJournalFileNames;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.nio.file.Path;
-
-/** IndexJournalWriter implementation that creates a sequence of journal files,
- * delegating to IndexJournalWriterSingleFileImpl to write the individual files.
- *
- */
-public class IndexJournalWriterPagingImpl implements IndexJournalWriter {
-    private final Path outputDir;
-    private int fileNumber = 0;
-
-    /** The maximum size of a journal file, in uncompressed bytes.
-     * This should be safely below 2 GB, since we assume in the construction
-     * of the index that this is the case!  The smaller these files are, the
-     * slower the index construction will be, but at the same time, if 2 GB
-     * is exceeded, the index construction will *quietly* fail.
-     *
-     * Flap flap, Icarus!
-     */
-    private static final long sizeLimitBytes = 1_000_000_000; // 1 GB
-
-
-    private final Logger logger = LoggerFactory.getLogger(getClass());
-    private IndexJournalWriter currentWriter = null;
-    private long bytesWritten = 0;
-
-    public IndexJournalWriterPagingImpl(Path outputDir) throws IOException {
-        this.outputDir = outputDir;
-        switchToNextWriter();
-
-        logger.info("Creating Journal Writer {}", outputDir);
-    }
-
-    private void switchToNextWriter() throws IOException {
-        if (currentWriter != null)
-            currentWriter.close();
-
-        currentWriter = new IndexJournalWriterSingleFileImpl(IndexJournalFileNames.allocateName(outputDir, fileNumber++));
-    }
-
-    @Override
-    @SneakyThrows
-    public int put(IndexJournalEntryHeader header, IndexJournalEntryData data)
-    {
-        if (bytesWritten >= sizeLimitBytes) {
-            bytesWritten = 0;
-            switchToNextWriter();
-        }
-
-        int writtenNow = currentWriter.put(header, data);
-        bytesWritten += writtenNow;
-
-        return writtenNow;
-    }
-
-    public void close() throws IOException {
-        currentWriter.close();
-    }
-}
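The rotation logic above is worth spelling out, since the 1 GB soft cap exists only because index construction memory-maps each file and quietly fails past 2 GB. A sketch of the same check-before-write pattern in isolation; the file naming here is hypothetical (the real writer delegated to IndexJournalFileNames.allocateName), and a record may still overshoot the cap by up to one record's size:

    import java.io.IOException;
    import java.nio.ByteBuffer;
    import java.nio.channels.FileChannel;
    import java.nio.file.Path;
    import java.nio.file.StandardOpenOption;

    class RotatingFileSketch implements AutoCloseable {
        private static final long SIZE_LIMIT_BYTES = 1_000_000_000; // mirror the journal's soft cap

        private final Path dir;
        private int fileNumber = 0;
        private long bytesWritten = 0;
        private FileChannel current;

        RotatingFileSketch(Path dir) throws IOException {
            this.dir = dir;
            rotate();
        }

        private void rotate() throws IOException {
            if (current != null) current.close();
            // hypothetical naming scheme, for illustration only
            current = FileChannel.open(dir.resolve("page." + (fileNumber++) + ".dat"),
                    StandardOpenOption.CREATE, StandardOpenOption.WRITE);
            bytesWritten = 0;
        }

        void put(ByteBuffer record) throws IOException {
            // check before, not after: the current record may push us past the cap
            if (bytesWritten >= SIZE_LIMIT_BYTES) rotate();
            bytesWritten += record.remaining();
            while (record.hasRemaining()) current.write(record);
        }

        public void close() throws IOException {
            current.close();
        }
    }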
diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java
deleted file mode 100644
index f12c92f6..00000000
--- a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java
+++ /dev/null
@@ -1,155 +0,0 @@
-package nu.marginalia.index.journal.writer;
-
-import com.github.luben.zstd.ZstdDirectBufferCompressingStream;
-import lombok.SneakyThrows;
-import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
-import nu.marginalia.index.journal.model.IndexJournalEntryData;
-import nu.marginalia.index.journal.reader.IndexJournalReader;
-import nu.marginalia.sequence.CodedSequence;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.*;
-import java.nio.ByteBuffer;
-import java.nio.channels.FileChannel;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardOpenOption;
-import java.nio.file.attribute.PosixFilePermissions;
-
-/** IndexJournalWriter implementation that creates a single journal file */
-public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
-
-    private static final int ZSTD_BUFFER_SIZE = 1<<16;
-    private static final int DATA_BUFFER_SIZE = 1<<16;
-
-    private final ByteBuffer dataBuffer = ByteBuffer.allocateDirect(DATA_BUFFER_SIZE);
-
-    private final ZstdDirectBufferCompressingStream compressingStream;
-    private final FileChannel fileChannel;
-
-    private int numEntries = 0;
-    private boolean closed = false;
-
-    private final Logger logger = LoggerFactory.getLogger(getClass());
-
-    public IndexJournalWriterSingleFileImpl(Path outputFile) throws IOException {
-
-        logger.info("Creating Journal Writer {}", outputFile);
-
-        Files.deleteIfExists(outputFile);
-        Files.createFile(outputFile, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
-
-        fileChannel = FileChannel.open(outputFile, StandardOpenOption.CREATE,
-                StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING);
-
-        writeHeaderPlaceholder(fileChannel);
-
-        compressingStream = new ZstdDirectBufferCompressingStream(ByteBuffer.allocateDirect(ZSTD_BUFFER_SIZE), 3) {
-            protected ByteBuffer flushBuffer(ByteBuffer toFlush) throws IOException {
-                toFlush.flip();
-                while (toFlush.hasRemaining()) {
-                    fileChannel.write(toFlush);
-                }
-                toFlush.clear();
-
-                return toFlush;
-            }
-        };
-    }
-
-    /** The file has a non-compressed header at the beginning of the file.
-     *  Write a placeholder first to reserve the bytes, and position the
-     *  channel after the header
-     */
-    private static void writeHeaderPlaceholder(FileChannel fileStream) throws IOException {
-        var buffer = ByteBuffer.allocate(IndexJournalReader.FILE_HEADER_SIZE_BYTES);
-
-        buffer.position(0);
-        buffer.limit(buffer.capacity());
-
-        while (buffer.hasRemaining())
-            fileStream.write(buffer, buffer.position());
-
-        fileStream.position(IndexJournalReader.FILE_HEADER_SIZE_BYTES);
-    }
-
-    @Override
-    @SneakyThrows
-    public int put(IndexJournalEntryHeader header,
-                   IndexJournalEntryData data)
-    {
-        final long[] keywords = data.termIds();
-        final long[] metadata = data.metadata();
-        final CodedSequence[] positions = data.positions();
-
-        int entrySize = 0;
-        for (var position : positions) {
-            entrySize += IndexJournalReader.TERM_HEADER_SIZE_BYTES + position.bufferSize();
-        }
-        int totalSize = IndexJournalReader.DOCUMENT_HEADER_SIZE_BYTES + entrySize;
-
-        if (entrySize > DATA_BUFFER_SIZE) {
-            // This should never happen, but if it does, we should log it and deal with it in a way that doesn't corrupt the file
-            // (64 KB is *a lot* of data for a single document, larger than the uncompressed HTML in like the 95%th percentile of web pages)
-            logger.error("Omitting entry: Record size {} exceeds maximum representable size of {}", entrySize, DATA_BUFFER_SIZE);
-            return 0;
-        }
-
-        if (dataBuffer.remaining() < totalSize) {
-            dataBuffer.flip();
-            compressingStream.compress(dataBuffer);
-            dataBuffer.clear();
-        }
-
-        if (dataBuffer.remaining() < totalSize) {
-            logger.error("Omitting entry: Record size {} exceeds buffer size of {}", totalSize, dataBuffer.capacity());
-            return 0;
-        }
-
-        assert entrySize < (1 << 16) : "Entry size must not exceed USHORT_MAX";
-
-        dataBuffer.putShort((short) entrySize);
-        dataBuffer.putShort((short) Math.clamp(header.documentSize(), 0, Short.MAX_VALUE));
-        dataBuffer.putInt(header.documentFeatures());
-        dataBuffer.putLong(header.combinedId());
-        dataBuffer.putLong(header.documentMeta());
-
-        for (int i = 0; i < keywords.length; i++) {
-            dataBuffer.putLong(keywords[i]);
-            dataBuffer.putShort((short) metadata[i]);
-            dataBuffer.putShort((short) positions[i].bufferSize());
-            dataBuffer.put(positions[i].buffer());
-        }
-
-        numEntries++;
-
-        return totalSize;
-    }
-
-    public void close() throws IOException {
-        if (closed)
-            return;
-        else
-            closed = true;
-
-        dataBuffer.flip();
-        compressingStream.compress(dataBuffer);
-        dataBuffer.clear();
-        compressingStream.flush();
-        compressingStream.close();
-
-
-        // Finalize the file by writing a header in the beginning
-        ByteBuffer header = ByteBuffer.allocate(IndexJournalReader.FILE_HEADER_SIZE_BYTES);
-        header.putLong(numEntries);
-        header.putLong(0);  // reserved for future use
-        header.flip();
-
-        while (header.position() < header.limit()) {
-            fileChannel.write(header, header.position());
-        }
-
-        fileChannel.close();
-    }
-}
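Spelled out, the uncompressed record layout the writer above streams through zstd is a document header (entry size u16, document size u16, features u32, combined id u64, document meta u64) followed by one (term id u64, term meta u16, positions size u16, positions bytes) tuple per keyword. A sketch restating just the buffer layout, with the compression and file-header bookkeeping omitted; note entrySize covers only the per-term tuples, not the document header itself:

    import java.nio.ByteBuffer;

    class JournalRecordLayoutSketch {
        // Mirrors the deleted writer's wire format
        static void writeRecord(ByteBuffer out,
                                int entrySize, int documentSize, int documentFeatures,
                                long combinedId, long documentMeta,
                                long[] termIds, short[] termMeta, ByteBuffer[] positions) {
            // document header
            out.putShort((short) entrySize);
            out.putShort((short) Math.min(documentSize, Short.MAX_VALUE));
            out.putInt(documentFeatures);
            out.putLong(combinedId);
            out.putLong(documentMeta);

            // per-term tuples
            for (int i = 0; i < termIds.length; i++) {
                out.putLong(termIds[i]);
                out.putShort(termMeta[i]);
                out.putShort((short) positions[i].remaining());
                out.put(positions[i]);
            }
        }
    }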
diff --git a/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalWriterTest.java b/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalWriterTest.java
deleted file mode 100644
index 5aa24ff7..00000000
--- a/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalWriterTest.java
+++ /dev/null
@@ -1,448 +0,0 @@
-package nu.marginalia.index.journal;
-
-import it.unimi.dsi.fastutil.ints.IntList;
-import it.unimi.dsi.fastutil.longs.LongArrayList;
-import it.unimi.dsi.fastutil.longs.LongList;
-import nu.marginalia.hash.MurmurHash3_128;
-import nu.marginalia.index.journal.model.IndexJournalEntryData;
-import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
-import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
-import nu.marginalia.index.journal.reader.IndexJournalReaderPagingImpl;
-import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
-import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
-import nu.marginalia.model.id.UrlIdCodec;
-import nu.marginalia.model.idx.DocumentMetadata;
-import nu.marginalia.model.idx.WordFlags;
-import nu.marginalia.model.idx.WordMetadata;
-import nu.marginalia.sequence.GammaCodedSequence;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.*;
-import java.util.stream.Collectors;
-import java.util.stream.IntStream;
-
-import static org.junit.jupiter.api.Assertions.*;
-
-public class IndexJournalWriterTest {
-    Path tempFile;
-    Path tempFile2;
-    ByteBuffer workArea = ByteBuffer.allocate(1024);
-
-    @BeforeEach
-    public void setUp() throws IOException {
-        tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
-        tempFile2 = Files.createTempFile(getClass().getSimpleName(), ".dat");
-    }
-    @AfterEach
-    public void tearDown() throws IOException {
-        Files.delete(tempFile);
-        Files.delete(tempFile2);
-    }
-
-    private GammaCodedSequence gcs(int... values) {
-        return GammaCodedSequence.generate(workArea, values);
-    }
-
-    static MurmurHash3_128 hasher = new MurmurHash3_128();
-    static long wordId(String str) {
-        return hasher.hashKeyword(str);
-    }
-
-    @Test
-    public void testSingleFile() {
-        try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
-            // Write two documents with two terms each
-            writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
-                    new IndexJournalEntryData(
-                            new String[]{"word1", "word2"},
-                            new long[]{44, 55},
-                            new GammaCodedSequence[]{
-                                    gcs(1, 3, 5),
-                                    gcs(2, 4, 6),
-                            })
-            );
-            writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
-                    new IndexJournalEntryData(
-                            new String[]{"word1", "word2"},
-                            new long[]{45, 56},
-                            new GammaCodedSequence[]{
-                                    gcs(2, 4, 6),
-                                    gcs(3, 5, 7),
-                            })
-            );
-        }
-        catch (IOException ex) {
-            Assertions.fail(ex);
-        }
-
-        // Read the journal back
-
-        try {
-            var reader = new IndexJournalReaderSingleFile(tempFile);
-
-            Iterator<IndexJournalEntryTermData> iter;
-            IndexJournalEntryTermData termData;
-
-            try (var ptr = reader.newPointer()) {
-
-                /** DOCUMENT 1 */
-                assertTrue(ptr.nextDocument());
-                assertEquals(11, ptr.documentId());
-                assertEquals(22, ptr.documentFeatures());
-                assertEquals(33, ptr.documentMeta());
-                assertEquals(10, ptr.documentSize());
-
-                iter = ptr.iterator();
-
-                // Term 1
-                assertTrue(iter.hasNext());
-                termData = iter.next();
-                assertEquals(wordId("word1"), termData.termId());
-                assertEquals(44, termData.metadata());
-                assertEquals(IntList.of(1, 3, 5), termData.positions().values());
-
-                // Term 2
-                assertTrue(iter.hasNext());
-                termData = iter.next();
-                assertEquals(wordId("word2"), termData.termId());
-                assertEquals(55, termData.metadata());
-                assertEquals(IntList.of(2, 4, 6), termData.positions().values());
-
-                // No more terms
-
-                assertFalse(iter.hasNext());
-
-                /** DOCUMENT 2 */
-                assertTrue(ptr.nextDocument());
-                assertEquals(12, ptr.documentId());
-                assertEquals(23, ptr.documentFeatures());
-                assertEquals(34, ptr.documentMeta());
-                assertEquals(11, ptr.documentSize());
-
-                iter = ptr.iterator();
-                // Term 1
-                assertTrue(iter.hasNext());
-                termData = iter.next();
-                assertEquals(wordId("word1"), termData.termId());
-                assertEquals(45, termData.metadata());
-                assertEquals(IntList.of(2, 4, 6), termData.positions().values());
-
-                // Term 2
-                assertTrue(iter.hasNext());
-                termData = iter.next();
-                assertEquals(wordId("word2"), termData.termId());
-                assertEquals(56, termData.metadata());
-                assertEquals(IntList.of(3, 5, 7), termData.positions().values());
-
-                // No more terms
-                assertFalse(iter.hasNext());
-
-                // No more documents
-                assertFalse(ptr.nextDocument());
-            }
-        }
-        catch (IOException ex) {
-            Assertions.fail(ex);
-        }
-    }
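Each of these tests encodes term positions with the gamma-coded sequence from the coded-sequence library; the round trip the assertions rely on looks roughly like this in isolation:

    import it.unimi.dsi.fastutil.ints.IntList;
    import nu.marginalia.sequence.GammaCodedSequence;

    import java.nio.ByteBuffer;

    class PositionsRoundTripSketch {
        public static void main(String[] args) {
            ByteBuffer workArea = ByteBuffer.allocate(1024);

            // Positions are ascending offsets; Elias-gamma coding keeps the
            // small deltas between them compact
            GammaCodedSequence positions = GammaCodedSequence.generate(workArea, 1, 3, 5);

            IntList values = positions.values();
            System.out.println(values);  // [1, 3, 5], as asserted in the tests above
        }
    }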
String[]{"word1", "word2"}, - new long[]{44, 55}, - new GammaCodedSequence[]{ - gcs(1, 3, 5), - gcs(2, 4, 6), - }) - ); - } - catch (IOException ex) { - Assertions.fail(ex); - } - - // Read the journal back - - try { - var reader = new IndexJournalReaderSingleFile(tempFile); - - Iterator iter; - IndexJournalEntryTermData termData; - - try (var ptr = reader.newPointer()) { - - /** DOCUMENT 1 */ - assertTrue(ptr.nextDocument()); - assertEquals(11, ptr.documentId()); - assertEquals(22, ptr.documentFeatures()); - assertEquals(10, ptr.documentSize()); - assertEquals(33, ptr.documentMeta()); - - iter = ptr.iterator(); - // Term 1 - assertTrue(iter.hasNext()); - termData = iter.next(); - assertEquals(wordId("word1"), termData.termId()); - assertEquals(44, termData.metadata()); - assertEquals(IntList.of(1, 3, 5), termData.positions().values()); - - // Ensure we can iterate again over the same document without persisting state or closing the pointer - - iter = ptr.iterator(); - // Term 1 - assertTrue(iter.hasNext()); - termData = iter.next(); - assertEquals(wordId("word1"), termData.termId()); - assertEquals(44, termData.metadata()); - assertEquals(IntList.of(1, 3, 5), termData.positions().values()); - } - } - catch (IOException ex) { - Assertions.fail(ex); - } - } - - @Test - public void testFiltered() { - try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) { - // Write two documents with two terms each - writer.put(new IndexJournalEntryHeader(11, 22, 10, 33), - new IndexJournalEntryData( - new String[]{"word1", "word2"}, - new long[]{44, 55}, - new GammaCodedSequence[]{ - gcs(1, 3, 5), - gcs(2, 4, 6), - }) - ); - writer.put(new IndexJournalEntryHeader(12, 23, 11, 34), - new IndexJournalEntryData( - new String[]{"word1", "word2"}, - new long[]{45, 56}, - new GammaCodedSequence[]{ - gcs(2, 4, 6), - gcs(3, 5, 7), - } - )); - } - catch (IOException ex) { - Assertions.fail(ex); - } - - // Read the journal back - - try { - var reader = new IndexJournalReaderSingleFile(tempFile).filtering(meta -> meta == 45); - - Iterator iter; - IndexJournalEntryTermData termData; - - try (var ptr = reader.newPointer()) { - /** DOCUMENT 2 */ - assertTrue(ptr.nextDocument()); - assertEquals(12, ptr.documentId()); - assertEquals(23, ptr.documentFeatures()); - assertEquals(34, ptr.documentMeta()); - assertEquals(11, ptr.documentSize()); - - iter = ptr.iterator(); - // Term 1 - assertTrue(iter.hasNext()); - termData = iter.next(); - assertEquals(wordId("word1"), termData.termId()); - assertEquals(45, termData.metadata()); - assertEquals(IntList.of(2, 4, 6), termData.positions().values()); - - // No more terms - assertFalse(iter.hasNext()); - // No more documents - assertFalse(ptr.nextDocument()); - } - } - catch (IOException ex) { - Assertions.fail(ex); - } - } - - @Test - public void testIntegrationScenario() throws IOException { - Map wordMap = new HashMap<>(); - for (int i = 0; i < 512; i++) { - wordMap.put(hasher.hashKeyword(Integer.toString(i)), i); - } - try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) { - for (int idc = 1; idc < 512; idc++) { - int id = idc; - int[] factors = IntStream - .rangeClosed(1, id) - .filter(v -> (id % v) == 0) - .toArray(); - - System.out.println("id:" + id + " factors: " + Arrays.toString(factors)); - - long fullId = UrlIdCodec.encodeId((32 - (id % 32)), id); - - var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); - - String[] keywords = 
IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new); - long[] metadata = new long[factors.length]; - for (int i = 0; i < factors.length; i++) { - metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); - } - GammaCodedSequence[] positions = new GammaCodedSequence[factors.length]; - ByteBuffer wa = ByteBuffer.allocate(16); - for (int i = 0; i < factors.length; i++) { - positions[i] = GammaCodedSequence.generate(wa, i + 1); - } - - writer.put(header, new IndexJournalEntryData(keywords, metadata, positions)); - } - } - - try (var ptr = new IndexJournalReaderSingleFile(tempFile).newPointer()) { - while (ptr.nextDocument()) { - int ordinal = UrlIdCodec.getDocumentOrdinal(ptr.documentId()); - System.out.println(ordinal); - - var expectedFactors = - new LongArrayList(IntStream - .rangeClosed(1, ordinal) - .filter(v -> (ordinal % v) == 0) - .mapToObj(Integer::toString) - .mapToLong(hasher::hashKeyword) - .toArray()); - - LongList foundIds = new LongArrayList(); - - var iter = ptr.iterator(); - while (iter.hasNext()) { - var termData = iter.next(); - foundIds.add(termData.termId()); - } - - if (!expectedFactors.equals(foundIds)) { - System.out.println("Found: "); - System.out.println(foundIds.stream().map(fac -> wordMap.getOrDefault(fac, -1)).map(Objects::toString).collect(Collectors.joining(","))); - System.out.println("Expected: "); - System.out.println(expectedFactors.stream().map(fac -> wordMap.getOrDefault(fac, -1)).map(Objects::toString).collect(Collectors.joining(","))); - fail(); - } - assertEquals(expectedFactors, foundIds); - } - } - } - -} diff --git a/code/index/index-reverse/build.gradle b/code/index/index-reverse/build.gradle index 1ba91c19..eb83d6ce 100644 --- a/code/index/index-reverse/build.gradle +++ b/code/index/index-reverse/build.gradle @@ -16,11 +16,13 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':code:libraries:array') implementation project(':code:libraries:btree') + implementation project(':code:libraries:slop') implementation project(':code:libraries:coded-sequence') implementation project(':code:libraries:random-write-funnel') implementation project(':code:index:query') implementation project(':code:index:index-journal') implementation project(':code:common:model') + implementation project(':code:processes:converting-process:model') implementation project(':code:common:process') implementation project(':third-party:parquet-floor') @@ -34,5 +36,6 @@ dependencies { testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito + testImplementation project(':code:libraries:test-helpers') } diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/JournalReaderSource.java b/code/index/index-reverse/java/nu/marginalia/index/construction/JournalReaderSource.java deleted file mode 100644 index b565206d..00000000 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/JournalReaderSource.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.index.construction; - -import nu.marginalia.index.journal.reader.IndexJournalReader; - -import java.io.IOException; -import java.nio.file.Path; - -public interface JournalReaderSource { - IndexJournalReader construct(Path sourceFile) throws IOException; -} diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexConstructor.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexConstructor.java index 
db7d5604..c1ce1b5c 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexConstructor.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexConstructor.java @@ -2,10 +2,10 @@ package nu.marginalia.index.construction.full; import lombok.SneakyThrows; import nu.marginalia.index.construction.DocIdRewriter; -import nu.marginalia.index.construction.JournalReaderSource; import nu.marginalia.index.construction.PositionsFileConstructor; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalPage; import nu.marginalia.process.control.ProcessHeartbeat; -import nu.marginalia.index.journal.IndexJournalFileNames; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,20 +26,17 @@ public class FullIndexConstructor { private final Path outputFileDocs; private final Path outputFileWords; private final Path outputFilePositions; - private final JournalReaderSource readerSource; private final DocIdRewriter docIdRewriter; private final Path tmpDir; public FullIndexConstructor(Path outputFileDocs, Path outputFileWords, Path outputFilePositions, - JournalReaderSource readerSource, DocIdRewriter docIdRewriter, Path tmpDir) { this.outputFileDocs = outputFileDocs; this.outputFileWords = outputFileWords; this.outputFilePositions = outputFilePositions; - this.readerSource = readerSource; this.docIdRewriter = docIdRewriter; this.tmpDir = tmpDir; } @@ -48,8 +45,8 @@ public class FullIndexConstructor { String processName, Path sourceBaseDir) throws IOException { - var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir); - if (inputs.isEmpty()) { + var journal = IndexJournal.findJournal(sourceBaseDir); + if (journal.isEmpty()) { logger.error("No journal files in base dir {}", sourceBaseDir); return; } @@ -62,10 +59,12 @@ public class FullIndexConstructor { AtomicInteger progress = new AtomicInteger(0); - inputs - .parallelStream() + var journalVersions = journal.get().pages(); + + journalVersions + .stream() .map(in -> { - preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size()); + preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), journalVersions.size()); return construct(in, posConstructor); }) .reduce(this::merge) @@ -80,9 +79,9 @@ public class FullIndexConstructor { } @SneakyThrows - private FullPreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) { + private FullPreindexReference construct(IndexJournalPage journalInstance, PositionsFileConstructor positionsFileConstructor) { return FullPreindex - .constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir) + .constructPreindex(journalInstance, positionsFileConstructor, docIdRewriter, tmpDir) .closeToReference(); } diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java index 063324d2..50f3a4bb 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java @@ -8,7 +8,7 @@ import nu.marginalia.index.construction.CountToOffsetTransformer; import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.IndexSizeEstimator; import nu.marginalia.index.construction.PositionsFileConstructor; -import 
nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.index.journal.IndexJournalPage; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -43,7 +43,7 @@ public class FullPreindex { /** Constructs a new preindex with the data associated with reader. The backing files * will have randomly assigned names. */ - public static FullPreindex constructPreindex(IndexJournalReader reader, + public static FullPreindex constructPreindex(IndexJournalPage journalInstance, PositionsFileConstructor positionsFileConstructor, DocIdRewriter docIdRewriter, Path workDir) throws IOException @@ -52,13 +52,13 @@ public class FullPreindex { Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat"); Path docsFile = Files.createTempFile(workDir, "docs", ".dat"); - var segments = FullPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile); - var docs = FullPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments); + var segments = FullPreindexWordSegments.construct(journalInstance, segmentWordsFile, segmentCountsFile); + var docs = FullPreindexDocuments.construct(docsFile, workDir, journalInstance, docIdRewriter, positionsFileConstructor, segments); return new FullPreindex(segments, docs); } /** Close the associated memory mapped areas and return - * a dehydrated version of this object that can be re-opened + * a dehydrated page of this object that can be re-opened * later. */ public FullPreindexReference closeToReference() { diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java index bae7990a..9cadeb41 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java @@ -5,12 +5,13 @@ import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.PositionsFileConstructor; -import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.index.journal.IndexJournalPage; import nu.marginalia.rwf.RandomFileAssembler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.nio.file.Files; import java.nio.file.Path; @@ -39,13 +40,13 @@ public class FullPreindexDocuments { public static FullPreindexDocuments construct( Path docsFile, Path workDir, - IndexJournalReader reader, + IndexJournalPage journalInstance, DocIdRewriter docIdRewriter, PositionsFileConstructor positionsFileConstructor, FullPreindexWordSegments segments) throws IOException { FullPreindexDocuments.positionsFileConstructor = positionsFileConstructor; - createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter); + createUnsortedDocsFile(docsFile, workDir, journalInstance, segments, docIdRewriter); LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile); sortDocsFile(docsFileMap, segments); @@ -68,28 +69,42 @@ public class FullPreindexDocuments { private static void createUnsortedDocsFile(Path docsFile, Path workDir, - IndexJournalReader reader, + IndexJournalPage journalInstance, FullPreindexWordSegments segments, DocIdRewriter docIdRewriter) throws IOException { long 
fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize(); + final ByteBuffer tempBuffer = ByteBuffer.allocate(65536); + try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs); - var pointer = reader.newPointer()) + var docIds = journalInstance.openCombinedId(); + var termCounts = journalInstance.openTermCounts(); + var termIds = journalInstance.openTermIds(); + var termMeta = journalInstance.openTermMetadata(); + var positions = journalInstance.openTermPositions()) { var offsetMap = segments.asMap(RECORD_SIZE_LONGS); offsetMap.defaultReturnValue(0); - while (pointer.nextDocument()) { - long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId()); - for (var termData : pointer) { - long termId = termData.termId(); + while (termCounts.hasRemaining()) { + long docId = docIds.get(); + long rankEncodedId = docIdRewriter.rewriteDocId(docId); + + long termCount = termCounts.get(); + + for (int termIdx = 0; termIdx < termCount; termIdx++) { + long termId = termIds.get(); + byte meta = termMeta.get(); + + // Read positions + tempBuffer.clear(); + positions.getData(tempBuffer); + tempBuffer.flip(); long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS); - - // write position data to the positions file and get the offset - long encodedPosOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positionsBuffer()); + long encodedPosOffset = positionsFileConstructor.add(meta, tempBuffer); assembly.put(offset + 0, rankEncodedId); assembly.put(offset + 1, encodedPosOffset); diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexReference.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexReference.java index 9045b0c7..73bd03b2 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexReference.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexReference.java @@ -5,7 +5,7 @@ import nu.marginalia.array.LongArrayFactory; import java.io.IOException; import java.nio.file.Path; -/** This is a dehydrated version of a FullPreIndex, that only +/** This is a dehydrated page of a FullPreIndex, that only * keeps references to its location on disk but does not hold associated * memory maps. 
*/ diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java index eb744616..120b1326 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java @@ -5,7 +5,7 @@ import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; import it.unimi.dsi.fastutil.longs.LongIterator; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; -import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.index.journal.IndexJournalPage; import java.io.IOException; import java.nio.file.Files; @@ -51,14 +51,20 @@ public class FullPreindexWordSegments { return ret; } - public static FullPreindexWordSegments construct(IndexJournalReader reader, + public static FullPreindexWordSegments construct(IndexJournalPage journalInstance, Path wordIdsFile, Path countsFile) throws IOException { Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f); countsMap.defaultReturnValue(0); - reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1)); + + try (var termIds = journalInstance.openTermIds()) { + while (termIds.hasRemaining()) { + countsMap.addTo(termIds.get(), 1); + } + } + LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size()); LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size()); diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java index 93616e88..f382f91b 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java @@ -2,8 +2,8 @@ package nu.marginalia.index.construction.prio; import lombok.SneakyThrows; import nu.marginalia.index.construction.DocIdRewriter; -import nu.marginalia.index.construction.JournalReaderSource; -import nu.marginalia.index.journal.IndexJournalFileNames; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalPage; import nu.marginalia.process.control.ProcessHeartbeat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,18 +24,15 @@ public class PrioIndexConstructor { private final Path outputFileDocs; private final Path outputFileWords; - private final JournalReaderSource readerSource; private final DocIdRewriter docIdRewriter; private final Path tmpDir; public PrioIndexConstructor(Path outputFileDocs, Path outputFileWords, - JournalReaderSource readerSource, DocIdRewriter docIdRewriter, Path tmpDir) { this.outputFileDocs = outputFileDocs; this.outputFileWords = outputFileWords; - this.readerSource = readerSource; this.docIdRewriter = docIdRewriter; this.tmpDir = tmpDir; } @@ -44,8 +41,8 @@ public class PrioIndexConstructor { String processName, Path sourceBaseDir) throws IOException { - var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir); - if (inputs.isEmpty()) { + var journal = IndexJournal.findJournal(sourceBaseDir); + if (journal.isEmpty()) { logger.error("No journal files in base dir {}", sourceBaseDir); return; } @@ -57,10 +54,12 @@ public class PrioIndexConstructor { AtomicInteger progress = new 
AtomicInteger(0); - inputs - .parallelStream() + var journalVersions = journal.get().pages(); + + journalVersions + .stream() .map(in -> { - preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size()); + preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), journalVersions.size()); return construct(in); }) .reduce(this::merge) @@ -75,9 +74,9 @@ public class PrioIndexConstructor { } @SneakyThrows - private PrioPreindexReference construct(Path input) { + private PrioPreindexReference construct(IndexJournalPage journalInstance) { return PrioPreindex - .constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir) + .constructPreindex(journalInstance, docIdRewriter, tmpDir) .closeToReference(); } diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java index a9ac2337..ee1ab3ac 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java @@ -6,7 +6,7 @@ import nu.marginalia.btree.BTreeWriter; import nu.marginalia.index.ReverseIndexParameters; import nu.marginalia.index.construction.CountToOffsetTransformer; import nu.marginalia.index.construction.DocIdRewriter; -import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.index.journal.IndexJournalPage; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -16,7 +16,8 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; -import static nu.marginalia.array.algo.TwoArrayOperations.*; +import static nu.marginalia.array.algo.TwoArrayOperations.countDistinctElements; +import static nu.marginalia.array.algo.TwoArrayOperations.mergeArrays; /** Contains the data that would go into a reverse index, * that is, a mapping from words to documents, minus the actual @@ -41,7 +42,7 @@ public class PrioPreindex { /** Constructs a new preindex with the data associated with reader. The backing files * will have randomly assigned names. */ - public static PrioPreindex constructPreindex(IndexJournalReader reader, + public static PrioPreindex constructPreindex(IndexJournalPage indexJournalPage, DocIdRewriter docIdRewriter, Path workDir) throws IOException { @@ -49,13 +50,13 @@ public class PrioPreindex { Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat"); Path docsFile = Files.createTempFile(workDir, "docs", ".dat"); - var segments = PrioPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile); - var docs = PrioPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, segments); + var segments = PrioPreindexWordSegments.construct(indexJournalPage, segmentWordsFile, segmentCountsFile); + var docs = PrioPreindexDocuments.construct(docsFile, workDir, indexJournalPage, docIdRewriter, segments); return new PrioPreindex(segments, docs); } /** Close the associated memory mapped areas and return - * a dehydrated version of this object that can be re-opened + * a dehydrated page of this object that can be re-opened * later. 
*/ public PrioPreindexReference closeToReference() { diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java index 186d0d65..bdda5a4f 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java @@ -4,7 +4,7 @@ import lombok.SneakyThrows; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; import nu.marginalia.index.construction.DocIdRewriter; -import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.index.journal.IndexJournalPage; import nu.marginalia.rwf.RandomFileAssembler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -37,11 +37,11 @@ public class PrioPreindexDocuments { public static PrioPreindexDocuments construct( Path docsFile, Path workDir, - IndexJournalReader reader, + IndexJournalPage journalInstance, DocIdRewriter docIdRewriter, PrioPreindexWordSegments segments) throws IOException { - createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter); + createUnsortedDocsFile(docsFile, workDir, journalInstance, segments, docIdRewriter); LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile); sortDocsFile(docsFileMap, segments); @@ -54,37 +54,41 @@ public class PrioPreindexDocuments { } - public LongArray slice(long start, long end) { - return documents.range(start, end); - } - public long size() { return documents.size(); } private static void createUnsortedDocsFile(Path docsFile, Path workDir, - IndexJournalReader reader, + IndexJournalPage journalInstance, PrioPreindexWordSegments segments, DocIdRewriter docIdRewriter) throws IOException { long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize(); try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs); - var pointer = reader.newPointer()) + var docIds = journalInstance.openCombinedId(); + var termIdsCounts = journalInstance.openTermCounts(); + var termIds = journalInstance.openTermIds(); + var termMeta = journalInstance.openTermMetadata()) { var offsetMap = segments.asMap(RECORD_SIZE_LONGS); offsetMap.defaultReturnValue(0); - while (pointer.nextDocument()) { - long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId()); - for (var termData : pointer) { - long termId = termData.termId(); + while (docIds.hasRemaining()) { + long docId = docIds.get(); + long rankEncodedId = docIdRewriter.rewriteDocId(docId); - long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS); + long termCount = termIdsCounts.get(); + for (int termIdx = 0; termIdx < termCount; termIdx++) { + long termId = termIds.get(); + byte meta = termMeta.get(); - assembly.put(offset, rankEncodedId); + if (meta != 0) { + long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS); + assembly.put(offset, rankEncodedId); + } } } diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexReference.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexReference.java index 10b590dd..f2ccd8df 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexReference.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexReference.java @@ -5,7 +5,7 @@ import nu.marginalia.array.LongArrayFactory; import java.io.IOException; import 
java.nio.file.Path; -/** This is a dehydrated version of a PrioPreIndex, that only +/** This is a dehydrated page of a PrioPreIndex, that only * keeps references to its location on disk but does not hold associated * memory maps. */ diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java index 512f10ff..c2fe2e96 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java @@ -5,7 +5,7 @@ import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; import it.unimi.dsi.fastutil.longs.LongIterator; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; -import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.index.journal.IndexJournalPage; import java.io.IOException; import java.nio.file.Files; @@ -51,14 +51,26 @@ public class PrioPreindexWordSegments { return ret; } - public static PrioPreindexWordSegments construct(IndexJournalReader reader, + public static PrioPreindexWordSegments construct(IndexJournalPage journalInstance, Path wordIdsFile, Path countsFile) throws IOException { Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f); countsMap.defaultReturnValue(0); - reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1)); + + try (var termIds = journalInstance.openTermIds(); + var termMetas = journalInstance.openTermMetadata()) { + + while (termIds.hasRemaining()) { + long data = termIds.get(); + byte meta = termMetas.get(); + + if (meta != 0) { + countsMap.addTo(data, 1); + } + } + } LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size()); LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size()); diff --git a/code/index/index-reverse/test/nu/marginalia/index/FullReverseIndexReaderTest.java b/code/index/index-reverse/test/nu/marginalia/index/FullReverseIndexReaderTest.java index 6cf4349c..d77d2133 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/FullReverseIndexReaderTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/FullReverseIndexReaderTest.java @@ -2,6 +2,7 @@ package nu.marginalia.index; import it.unimi.dsi.fastutil.ints.IntList; import nu.marginalia.array.page.LongQueryBuffer; +import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.PositionsFileConstructor; import nu.marginalia.index.construction.full.FullPreindex; @@ -45,6 +46,11 @@ class FullReverseIndexReaderTest { Files.delete(tempDir); } + MurmurHash3_128 hash = new MurmurHash3_128(); + long termId(String keyword) { + return hash.hashKeyword(keyword); + } + @Test public void testSimple() throws IOException { @@ -52,18 +58,19 @@ class FullReverseIndexReaderTest { new EntryDataWithWordMeta(100, 101, wm(50, 51, 1, 3, 5)) ); - assertEquals(1, indexReader.numDocuments(50)); + assertEquals(1, indexReader.numDocuments(termId("50"))); - var positions = indexReader.getTermData(Arena.global(), 50, new long[] { 100 }); + var positions = indexReader.getTermData(Arena.global(), termId("50"), new long[] { 100 }); assertEquals(1, positions.length); assertNotNull(positions[0]); assertEquals((byte) 51, positions[0].flags()); assertEquals(IntList.of(1, 3, 5), positions[0].positions().values()); - 
-        assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
+        assertArrayEquals(new long[] { 100 }, readEntries(indexReader, termId("50")));
     }
 
+
     @Test
     public void test2x2() throws IOException {
 
@@ -72,13 +79,13 @@
             new EntryDataWithWordMeta(101, 101, wm(51, 53), wm(52, 54))
         );
 
-        assertEquals(1, indexReader.numDocuments(50));
-        assertEquals(2, indexReader.numDocuments(51));
-        assertEquals(1, indexReader.numDocuments(52));
+        assertEquals(1, indexReader.numDocuments(termId("50")));
+        assertEquals(2, indexReader.numDocuments(termId("51")));
+        assertEquals(1, indexReader.numDocuments(termId("52")));
 
-        assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
-        assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, 51));
-        assertArrayEquals(new long[] { 101 }, readEntries(indexReader, 52));
+        assertArrayEquals(new long[] { 100 }, readEntries(indexReader, termId("50")));
+        assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, termId("51")));
+        assertArrayEquals(new long[] { 101 }, readEntries(indexReader, termId("52")));
     }
 
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexDocsTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexDocsTest.java
index a5c87f0f..8f6e6a14 100644
--- a/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexDocsTest.java
+++ b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexDocsTest.java
@@ -1,5 +1,6 @@
 package nu.marginalia.index.construction.full;
 
+import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.construction.DocIdRewriter;
 import nu.marginalia.index.construction.PositionsFileConstructor;
 import org.junit.jupiter.api.AfterEach;
@@ -53,33 +54,9 @@ class FullPreindexDocsTest {
         Files.delete(tempDir);
     }
 
-    @Test
-    public void testDocs() throws IOException {
-        var reader = journalFactory.createReader(
-                new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
-        );
-
-        var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
-        var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), new PositionsFileConstructor(positionsFile), segments);
-
-        List<TestSegmentData> expected = List.of(
-                new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }),
-                new TestSegmentData(10, 2, 4, new long[] { -0xF00BA3L, 0 }),
-                new TestSegmentData(33, 4, 6, new long[] { -0xF00BA3L, 0 }),
-                new TestSegmentData(40, 6, 8, new long[] { -0xF00BA3L, 0 })
-        );
-
-        List<TestSegmentData> actual = new ArrayList<>();
-
-        var iter = segments.iterator(2);
-        while (iter.next()) {
-            long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
-            docs.slice(iter.startOffset, iter.endOffset).get(0, data);
-            actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
-                    data));
-        }
-
-        assertEquals(expected, actual);
+    MurmurHash3_128 hash = new MurmurHash3_128();
+    long termId(String keyword) {
+        return hash.hashKeyword(keyword);
     }
 
     @Test
@@ -94,7 +71,7 @@
                 segments);
 
         List<TestSegmentData> expected = List.of(
-                new TestSegmentData(4, 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 })
+                new TestSegmentData(termId("4"), 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 })
        );
 
        List<TestSegmentData> actual = new ArrayList<>();
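The same termId helper is spliced into each of these test classes because the new journal keys terms by keyword hash rather than by caller-chosen longs; what used to compare against the literal 50 now compares against the MurmurHash3_128 digest of the keyword "50":

    import nu.marginalia.hash.MurmurHash3_128;

    class TermIdSketch {
        public static void main(String[] args) {
            MurmurHash3_128 hash = new MurmurHash3_128();

            // The old tests used the raw long 50 as a word id; the new ones
            // look the term up by the hash of the keyword "50"
            long termId = hash.hashKeyword("50");
            System.out.println(Long.toHexString(termId));
        }
    }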
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexFinalizeTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexFinalizeTest.java
index 411f2cdc..253e0d52 100644
--- a/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexFinalizeTest.java
+++ b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexFinalizeTest.java
@@ -3,6 +3,7 @@ package nu.marginalia.index.construction.full;
 
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.btree.model.BTreeHeader;
+import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.construction.DocIdRewriter;
 import nu.marginalia.index.construction.PositionsFileConstructor;
 import org.junit.jupiter.api.AfterEach;
@@ -12,9 +13,11 @@ import org.junit.jupiter.api.Test;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.List;
 
-import static nu.marginalia.index.construction.full.TestJournalFactory.*;
+import static nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
+import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
@@ -51,6 +54,11 @@ class FullPreindexFinalizeTest {
         Files.delete(tempDir);
     }
 
+    MurmurHash3_128 hash = new MurmurHash3_128();
+    long termId(String keyword) {
+        return hash.hashKeyword(keyword);
+    }
+
     @Test
     public void testFinalizeSimple() throws IOException {
         var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
@@ -81,7 +89,7 @@
         assertEquals(1, wordsHeader.numEntries());
 
         assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
-        assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
+        assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
     }
 
 
@@ -121,8 +129,8 @@
         long offset1 = wordsArray.get(wordsHeader.dataOffsetLongs() + 1);
         long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3);
 
-        assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
-        assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
+        assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
+        assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
 
         BTreeHeader docsHeader;
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexMergeTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexMergeTest.java
deleted file mode 100644
index 85796e41..00000000
--- a/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexMergeTest.java
+++ /dev/null
@@ -1,435 +0,0 @@
-
-package nu.marginalia.index.construction.full;
-
-import nu.marginalia.index.construction.DocIdRewriter;
-import nu.marginalia.index.construction.PositionsFileConstructor;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.*;
-
-import static nu.marginalia.index.construction.full.TestJournalFactory.*;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-class FullPreindexMergeTest {
-    TestJournalFactory journalFactory;
-    Path countsFile;
-    Path wordsIdFile;
-    Path docsFile;
-    Path tempDir;
-    Path positionsFile;
-
-    @BeforeEach
-    public void setUp() throws IOException {
-        journalFactory = new TestJournalFactory();
-
-        positionsFile = Files.createTempFile("positions", ".dat");
-        countsFile = Files.createTempFile("counts", ".dat");
-        wordsIdFile = Files.createTempFile("words", ".dat");
-        docsFile = Files.createTempFile("docs", ".dat");
-        tempDir = Files.createTempDirectory("sort");
-    }
-
-    @AfterEach
-    public void tearDown() throws IOException {
-        journalFactory.clear();
-
-        Files.deleteIfExists(countsFile);
-        Files.deleteIfExists(wordsIdFile);
-        List<Path> contents = new ArrayList<>();
-        Files.list(tempDir).forEach(contents::add);
-        for (var tempFile : contents) {
-            Files.delete(tempFile);
-        }
-        Files.delete(tempDir);
-    }
-
-    public FullPreindex runMergeScenario(
-            List<EntryDataWithWordMeta> leftData,
-            List<EntryDataWithWordMeta> rightData
-    ) throws IOException {
-        var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new));
-        var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new));
-
-        var left = FullPreindex.constructPreindex(reader1, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
-        var right = FullPreindex.constructPreindex(reader2, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
-        return FullPreindex.merge(tempDir, left, right);
-    }
-
-    private List<TestSegmentData> getData(FullPreindex merged) {
-        var iter = merged.segments.iterator(2);
-        List<TestSegmentData> actual = new ArrayList<>();
-        while (iter.next()) {
-            long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
-            merged.documents.slice(iter.startOffset, iter.endOffset).get(0, data);
-            actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
-                    data));
-        }
-        return actual;
-    }
-
-    @Test
-    @Disabled
-    public void testDocsMergeSingleNoOverlap() throws IOException {
-
-        IdSequence docIds = new IdSequence();
-        IdSequence docMetas = new IdSequence();
-        IdSequence wordMetas = new IdSequence();
-        IdSequence wordIds = new IdSequence();
-
-        var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
-        var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
-
-        var merged = runMergeScenario(
-                leftSequence,
-                rightSequence
-        );
-
-        var actual = getData(merged);
-
-        var expected = simulateMerge(leftSequence, rightSequence);
-
-        System.out.println(actual);
-        assertEquals(expected, actual);
-    }
-
-    @Test
-    @Disabled
-    public void testDocsMergeSingleOnlyOverlap() throws IOException {
-
-        IdSequence docIds = new IdSequence();
-        IdSequence docMetas = new IdSequence();
-        IdSequence wordMetas = new IdSequence();
-        IdSequence wordIds = new IdSequence();
-
-        var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
-        var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.alreadySeenSameSequence(), wordMetas.nextUnique())));
-
-        var merged = runMergeScenario(
-                leftSequence,
-                rightSequence
-        );
-
-        var actual = getData(merged);
-
-        var expected = simulateMerge(leftSequence, rightSequence);
-
-        System.out.println(actual);
-        assertEquals(expected, actual);
-    }
-
-    @Test
-    @Disabled
-    public void testDocsMergeSingleOnlyOverlap2() throws IOException {
-
-        long wid1 = 1;
-        long wid2 = 2;
-        IdSequence docIds = new IdSequence();
-        IdSequence docMetas = new IdSequence();
-        IdSequence wordMetas = new IdSequence();
-
-        var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(),
-                wm(wid1, wordMetas.nextUnique()),
-                wm(wid2, wordMetas.nextUnique())
-        ));
-        var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(),
-                wm(wid1, wordMetas.nextUnique()),
-                wm(wid2, wordMetas.nextUnique())
-        ));
-
-        var merged = runMergeScenario(
-                leftSequence,
-                rightSequence
-        );
-
-        var actual = getData(merged);
-
-        var expected = simulateMerge(leftSequence, rightSequence);
-
-        System.out.println(actual);
-        assertEquals(expected, actual);
-    }
-
-    @Test
-    @Disabled
-    public void testBadCase1() throws IOException {
-        long wordId = 0xF00F00BA3L;
-
-        List<EntryDataWithWordMeta> leftSequence = List.of(new EntryDataWithWordMeta(40, 50,
-                wm(wordId, 5))
-        );
-        List<EntryDataWithWordMeta> rightSequence = List.of(new EntryDataWithWordMeta(41, 51,
-                wm(wordId, 3),
-                wm(wordId, 4))
-        );
-
-        var mergedLR = runMergeScenario(
-                leftSequence,
-                rightSequence
-        );
-        var mergedRL = runMergeScenario(
-                rightSequence,
-                leftSequence
-        );
-
-        var actualLR = getData(mergedLR);
-        var actualRL = getData(mergedRL);
-
-        var expected = simulateMerge(leftSequence, rightSequence);
-
-        assertEquals(actualLR, actualRL);
-
-        if (!expected.equals(actualLR)) {
-            System.out.println("*fail*");
-            System.out.println(leftSequence);
-            System.out.println(rightSequence);
-        }
-        else {
-            System.out.println("*pass*");
-        }
-
-        assertEquals(expected, actualLR);
-
-    }
-
-    @Test
-    @Disabled
-    public void testBadCase2() throws IOException {
-        long wordId = 100;
-
-        List<EntryDataWithWordMeta> leftSequence = List.of(
-                new EntryDataWithWordMeta(1, 50, wm(wordId, 5)),
-                new EntryDataWithWordMeta(2, 50, wm(wordId, 5))
-
-        );
-        List<EntryDataWithWordMeta> rightSequence = List.of(
-                new EntryDataWithWordMeta(3, 50, wm(wordId, 5))
-        );
-
-        var mergedLR = runMergeScenario(
-                leftSequence,
-                rightSequence
-        );
-        var mergedRL = runMergeScenario(
-                rightSequence,
-                leftSequence
-        );
-
-        var actualLR = getData(mergedLR);
-        var actualRL = getData(mergedRL);
-
-        var expected = simulateMerge(leftSequence, rightSequence);
-
-        assertEquals(actualLR, actualRL);
-
-        if (!expected.equals(actualLR)) {
-            System.out.println("*fail*");
-            System.out.println(leftSequence);
-            System.out.println(rightSequence);
-        }
-        else {
-            System.out.println("*pass*");
-        }
-
-        assertEquals(expected, actualLR);
-
-    }
-
-    @Test
-    @Disabled
-    public void testFuzz() throws IOException {
-        Random r = new Random();
-        int maxDocs = 150;
-        int maxWords = 160;
-        int nIters = 1000;
-
-        for (int i = 0; i < nIters; i++) {
-            int nLeft = 1 + r.nextInt(maxDocs);
-            int nRight = 1 + r.nextInt(maxDocs);
-
-            IdSequence docIdsLeft = new IdSequence();
-            IdSequence docIdsRight = new IdSequence();
-            IdSequence docMetas = new IdSequence();
-            IdSequence wordMetas = new IdSequence();
-            IdSequence wordIds = new IdSequence();
-
-            List<EntryDataWithWordMeta> leftSequence = new ArrayList<>(nLeft);
-            for (int j = 0; j < nLeft; j++) {
-                WordWithMeta[] words = new WordWithMeta[maxWords == 1 ? 1 : r.nextInt(1, maxWords)];
-                Arrays.setAll(words, idx -> {
-                    long wordId = wordIds.seenWithP(1.0);
-                    long wordMeta = wordMetas.nextUniqueAssociatedWithKey(wordId);
-                    return wm(wordId, wordMeta);
-                });
-
-                long docId = docIdsLeft.nextUnique();
-                long docMeta = docMetas.nextUniqueAssociatedWithKey(docId);
-                leftSequence.add(new EntryDataWithWordMeta(docId, docMeta, words));
-            }
-
-            List<EntryDataWithWordMeta> rightSequence = new ArrayList<>(nLeft);
-            for (int j = 0; j < nRight; j++) {
-                WordWithMeta[] words = new WordWithMeta[maxWords == 1 ? 1 : r.nextInt(1, maxWords)];
-                Arrays.setAll(words, idx -> {
-                    long wordId = wordIds.seenWithP(1.0);
-                    long wordMeta = wordMetas.nextUniqueAssociatedWithKey(wordId);
-                    return wm(wordId, wordMeta);
-                });
-
-                long docId = docIdsRight.seenWithP(docIdsLeft, 0.1);
-                long docMeta = docMetas.nextUniqueAssociatedWithKey(docId);
-                rightSequence.add(new EntryDataWithWordMeta(docId, docMeta, words));
-            }
-
-            var mergedLR = runMergeScenario(
-                    leftSequence,
-                    rightSequence
-            );
-            var mergedRL = runMergeScenario(
-                    rightSequence,
-                    leftSequence
-            );
-
-            var actualLR = getData(mergedLR);
-            var actualRL = getData(mergedRL);
-
-            var expected = simulateMerge(leftSequence, rightSequence);
-
-            assertEquals(actualLR, actualRL);
-
-            if (!expected.equals(actualLR)) {
-                System.out.println("*fail*");
-                System.out.println(leftSequence);
-                System.out.println(rightSequence);
-            }
-            else {
-                System.out.println("*pass*");
-            }
-
-            assertEquals(expected, actualLR);
-
-        }
-    }
-
-
-    public List<TestSegmentData> simulateMerge(
-            Collection<EntryDataWithWordMeta> leftInputs,
-            Collection<EntryDataWithWordMeta> rightInputs
-    ) {
-        TreeMap<Long, List<DocWithMeta>> wordToDocs = new TreeMap<>();
-
-        for (var entry : leftInputs) {
-            for (var wm : entry.wordIds()) {
-                wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add(
-                        new DocWithMeta(entry.docId(), wm.meta())
-                );
-            }
-        }
-        for (var entry : rightInputs) {
-            for (var wm : entry.wordIds()) {
-                wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add(
-                        new DocWithMeta(entry.docId(), wm.meta())
-                );
-            }
-        }
-
-        List<TestSegmentData> ret = new ArrayList<>();
-        int[] start = new int[1];
-        wordToDocs.forEach((wordId, docsList) -> {
-            docsList.sort(Comparator.naturalOrder());
-            var iter = docsList.iterator();
-            DocWithMeta prevVal = null;
-            DocWithMeta currentVal;
-            while (iter.hasNext()) {
-                currentVal = iter.next();
-                if (prevVal != null) {
-                    if (currentVal.docId == prevVal.docId) {
-                        iter.remove();
-                    }
-                }
-                prevVal = currentVal;
-
-            }
-            long[] data = new long[docsList.size()*2];
-            for (int i = 0; i < docsList.size(); i++) {
-                data[2*i] = docsList.get(i).docId;
-                data[2*i + 1] = docsList.get(i).meta;
-            }
-            ret.add(new TestSegmentData(wordId, start[0], start[0] + data.length, data));
-
-            start[0] += data.length;
-        });
-        return ret;
-    }
-
-
-    record DocWithMeta(long docId, long meta) implements Comparable<DocWithMeta> {
-
-        @Override
-        public int compareTo(DocWithMeta o) {
-            return Long.compare(docId, o.docId);
-        }
-    }
-
-    class IdSequence {
-        Set<Long> seen = new HashSet<>();
-        Map<Long, Long> associatedValues = new HashMap<>();
-        private Random random = new Random();
-
-        /** Return alreadySeen() with probability p,
-         *  else nextUnique()
-         */
-        public long seenWithP(double p) {
-            if (isEmpty() || random.nextDouble() > p)
-                return nextUnique();
-
-            return alreadySeenSameSequence();
-        }
-
-        public long seenWithP(IdSequence other, double p) {
-            if (isEmpty() || random.nextDouble() > p)
-                return nextUnique();
-
-            return alreadySeenOtherSequence(other);
-        }
-
-        public long nextUnique() {
-            for (;;) {
-                long val = random.nextLong();
-                if (seen.add(val)) {
-                    return val;
-                }
-            }
-        }
-
-        public long nextUniqueAssociatedWithKey(long key) {
-            return associatedValues.computeIfAbsent(key, k -> nextUnique());
-        }
-
-        public long alreadySeenSameSequence() {
-            long[] values = seen.stream().mapToLong(Long::longValue).toArray();
-            int idx = random.nextInt(0, values.length);
-            return values[idx];
-        }
-
-        public long alreadySeenOtherSequence(IdSequence other) {
-            List<Long> values = new ArrayList<>(other.seen);
-            Collections.shuffle(values);
-            for (Long maybe : values) {
-                if (seen.add(maybe))
-                    return maybe;
-            }
- return nextUnique(); - } - - public boolean isEmpty() { - return seen.isEmpty(); - } - } - -} \ No newline at end of file diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexWordSegmentsTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexWordSegmentsTest.java deleted file mode 100644 index 72c13207..00000000 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexWordSegmentsTest.java +++ /dev/null @@ -1,231 +0,0 @@ -package nu.marginalia.index.construction.full; - -import nu.marginalia.array.LongArray; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; - -import static nu.marginalia.index.construction.full.TestJournalFactory.*; -import static org.junit.jupiter.api.Assertions.*; - -class FullPreindexWordSegmentsTest { - Path countsFile; - Path wordsIdFile; - Path docsFile; - Path tempDir; - - TestJournalFactory journalFactory; - - @BeforeEach - public void setUp() throws IOException { - journalFactory = new TestJournalFactory(); - - countsFile = Files.createTempFile("counts", ".dat"); - wordsIdFile = Files.createTempFile("words", ".dat"); - docsFile = Files.createTempFile("docs", ".dat"); - tempDir = Files.createTempDirectory("sort"); - } - - @AfterEach - public void tearDown() throws IOException { - journalFactory.clear(); - - Files.deleteIfExists(countsFile); - Files.deleteIfExists(wordsIdFile); - List contents = new ArrayList<>(); - Files.list(tempDir).forEach(contents::add); - for (var tempFile : contents) { - Files.delete(tempFile); - } - Files.delete(tempDir); - } - @Test - public void testWordSegmentsLongWordId() throws IOException { - var reader = journalFactory.createReader( - new EntryData(-0xF00BA3L, 0, 1L<<33) - ); - - var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var iter = segments.iterator(1); - - List expected = List.of( - new TestSegmentData(1L<<33, 0, 1) - ); - - List actual = new ArrayList<>(); - - while (iter.next()) { - actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset)); - } - - assertEquals(expected, actual); - } - @Test - public void testWordSegmentsRepeatedWordId() throws IOException { - var reader = journalFactory.createReader( - new EntryData(-0xF00BA3L, 0, 5, 5) - ); - - var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var iter = segments.iterator(1); - - List expected = List.of( - new TestSegmentData(5, 0, 2) - ); - - List actual = new ArrayList<>(); - - while (iter.next()) { - actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset)); - } - - assertEquals(expected, actual); - } - - @Test - public void testWordSegments1() throws IOException { - var reader = journalFactory.createReader( - new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33) - ); - - var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var iter = segments.iterator(1); - - List expected = List.of( - new TestSegmentData(-100, 0, 1), - new TestSegmentData(10, 1, 2), - new TestSegmentData(33, 2, 3), - new TestSegmentData(40, 3, 4) - ); - - List actual = new ArrayList<>(); - - while (iter.next()) { - actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset)); - } - - assertEquals(expected, actual); - } - - @Test - 
public void testWordSegments2() throws IOException { - var reader = journalFactory.createReader( - new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33), - new EntryData(0xF00BA4L, 0, 15, 30, -100, 33) - ); - - var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var iter = segments.iterator(1); - - List expected = List.of( - new TestSegmentData(-100, 0, 2), - new TestSegmentData(10, 2, 3), - new TestSegmentData(15, 3, 4), - new TestSegmentData(30, 4, 5), - new TestSegmentData(33, 5, 7), - new TestSegmentData(40, 7, 8) - ); - - List actual = new ArrayList<>(); - - while (iter.next()) { - actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset)); - } - - assertEquals(expected, actual); - } - - - @Test - public void testWordSegments_ReadIterator() { - LongArray wordsArray = LongArray.allocate(4); - LongArray countsArray = LongArray.allocate(4); - wordsArray.set(0, -1, -2, -3, -4); - countsArray.set(0, 2, 1, 3, 5); - var segments = new FullPreindexWordSegments(wordsArray, countsArray, null, null); - - var ritr = segments.iterator(1); - assertTrue(ritr.hasMorePositions()); - assertTrue(ritr.next()); - assertTrue(ritr.isPositionBeforeEnd()); - assertEquals(-1, ritr.wordId); - assertEquals(0, ritr.idx()); - assertEquals(0, ritr.startOffset); - assertEquals(2, ritr.endOffset); - - assertTrue(ritr.hasMorePositions()); - assertTrue(ritr.next()); - assertTrue(ritr.isPositionBeforeEnd()); - assertEquals(-2, ritr.wordId); - assertEquals(1, ritr.idx()); - assertEquals(2, ritr.startOffset); - assertEquals(3, ritr.endOffset); - - assertTrue(ritr.hasMorePositions()); - assertTrue(ritr.next()); - assertTrue(ritr.isPositionBeforeEnd()); - assertEquals(-3, ritr.wordId); - assertEquals(2, ritr.idx()); - assertEquals(3, ritr.startOffset); - assertEquals(6, ritr.endOffset); - - assertTrue(ritr.hasMorePositions()); - assertTrue(ritr.next()); - assertTrue(ritr.isPositionBeforeEnd()); - assertEquals(-4, ritr.wordId); - assertEquals(3, ritr.idx()); - assertEquals(6, ritr.startOffset); - assertEquals(11, ritr.endOffset); - - assertFalse(ritr.hasMorePositions()); - assertFalse(ritr.next()); - assertFalse(ritr.isPositionBeforeEnd()); - - assertEquals(Long.MIN_VALUE, ritr.wordId); - } - - - @Test - public void testWordSegments_ConstructionIterator() { - LongArray wordsArray = LongArray.allocate(4); - LongArray countsArray = LongArray.allocate(4); - wordsArray.set(0, -1, -2, -3, -4); - var segments = new FullPreindexWordSegments(wordsArray, countsArray, null, null); - - var citr = segments.constructionIterator(1); - assertEquals(-1, citr.wordId); - assertEquals(0, citr.idx()); - assertTrue(citr.canPutMore()); - assertTrue(citr.putNext(1)); - assertEquals(1, countsArray.get(0)); - - assertEquals(-2, citr.wordId); - assertEquals(1, citr.idx()); - assertTrue(citr.canPutMore()); - assertTrue(citr.putNext(2)); - assertEquals(2, countsArray.get(1)); - - assertEquals(-3, citr.wordId); - assertEquals(2, citr.idx()); - assertTrue(citr.canPutMore()); - assertTrue(citr.putNext(3)); - assertEquals(3, countsArray.get(2)); - - assertEquals(-4, citr.wordId); - assertEquals(3, citr.idx()); - assertTrue(citr.canPutMore()); - assertFalse(citr.putNext(4)); - assertEquals(4, countsArray.get(3)); - - assertEquals(4, citr.idx()); - assertFalse(citr.canPutMore()); - assertEquals(Long.MIN_VALUE, citr.wordId); - } - -} \ No newline at end of file diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestJournalFactory.java 
b/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestJournalFactory.java index f34dcd9c..80c0970c 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestJournalFactory.java +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestJournalFactory.java @@ -1,17 +1,15 @@ package nu.marginalia.index.construction.full; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile; -import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl; +import nu.marginalia.index.journal.IndexJournalPage; +import nu.marginalia.index.journal.IndexJournalSlopWriter; +import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.sequence.GammaCodedSequence; +import nu.marginalia.test.TestUtil; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Objects; @@ -22,17 +20,13 @@ public class TestJournalFactory { public TestJournalFactory() throws IOException {} public void clear() throws IOException { - List toDelete = new ArrayList<>(); - try (var dirStream = Files.list(tempDir)) { - dirStream.forEach(toDelete::add); - } - for (var tempFile : toDelete) { - Files.delete(tempFile); - } - Files.delete(tempDir); + TestUtil.clearTempDir(tempDir); } - public record EntryData(long docId, long docMeta, long... wordIds) { + public record EntryData(long docId, long docMeta, String... wordIds) { + public EntryData(long docId, long docMeta, long... wordIds) { + this(docId, docMeta, Arrays.stream(wordIds).mapToObj(String::valueOf).toArray(String[]::new)); + } @Override public String toString() { return "EntryData{" + @@ -52,19 +46,23 @@ public class TestJournalFactory { '}'; } } - public record WordWithMeta(long wordId, long meta, GammaCodedSequence gcs) {} - - public static WordWithMeta wm(long wordId, long meta, int... positions) { - return new WordWithMeta(wordId, meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions)); + public record WordWithMeta(String wordId, byte meta, GammaCodedSequence gcs) { + public WordWithMeta(long wordId, byte meta, GammaCodedSequence gcs) { + this(String.valueOf(wordId), meta, gcs); + } } - public IndexJournalReader createReader(EntryData... entries) throws IOException { - Path jf = Files.createTempFile(tempDir, "journal", ".dat"); + public static WordWithMeta wm(long wordId, int meta, int... positions) { + return new WordWithMeta(wordId, (byte) meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions)); + } - var writer = new IndexJournalWriterSingleFileImpl(jf); + public IndexJournalPage createReader(EntryData... 
entries) throws IOException { + Path ji = Files.createTempDirectory(tempDir, "journal"); + + var writer = new IndexJournalSlopWriter(ji, 0); for (var entry : entries) { - long[] termIds = new long[entry.wordIds.length]; - long[] meta = new long[entry.wordIds.length]; + String[] termIds = new String[entry.wordIds.length]; + byte[] meta = new byte[entry.wordIds.length]; GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length]; for (int i = 0; i < entry.wordIds.length; i++) { @@ -73,22 +71,35 @@ public class TestJournalFactory { positions[i] = new GammaCodedSequence(new byte[1]); } - writer.put(new IndexJournalEntryHeader(entries.length, 0, 15, entry.docId, entry.docMeta), - new IndexJournalEntryData(termIds, meta, positions)); + writer.put( + entry.docId, + new SlopDocumentRecord.KeywordsProjection( + "test", + -1, + 0, + entry.docMeta, + 15, + Arrays.asList(termIds), + meta, + Arrays.asList(positions), + new byte[0], + List.of() + ) + ); } writer.close(); - var ret = new IndexJournalReaderSingleFile(jf); - return ret; + + return new IndexJournalPage(ji, 0); } - public IndexJournalReader createReader(EntryDataWithWordMeta... entries) throws IOException { - Path jf = Files.createTempFile(tempDir, "journal", ".dat"); + public IndexJournalPage createReader(EntryDataWithWordMeta... entries) throws IOException { + Path ji = Files.createTempDirectory(tempDir, "journal"); - var writer = new IndexJournalWriterSingleFileImpl(jf); + var writer = new IndexJournalSlopWriter(ji, 0); for (var entry : entries) { - long[] termIds = new long[entry.wordIds.length]; - long[] meta = new long[entry.wordIds.length]; + String[] termIds = new String[entry.wordIds.length]; + byte[] meta = new byte[entry.wordIds.length]; GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length]; for (int i = 0; i < entry.wordIds.length; i++) { termIds[i] = entry.wordIds[i].wordId; @@ -96,11 +107,25 @@ public class TestJournalFactory { positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, () -> new GammaCodedSequence(new byte[1])); } - writer.put(new IndexJournalEntryHeader(entries.length, 0, 15, entry.docId, entry.docMeta), - new IndexJournalEntryData(termIds, meta, positions)); + writer.put( + entry.docId, + new SlopDocumentRecord.KeywordsProjection( + "test", + -1, + 0, + entry.docMeta, + 15, + Arrays.asList(termIds), + meta, + Arrays.asList(positions), + new byte[0], + List.of() + ) + ); + } writer.close(); - var ret = new IndexJournalReaderSingleFile(jf); - return ret; + + return new IndexJournalPage(ji, 0); } } diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestSegmentData.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestSegmentData.java index f37b5975..d325e029 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestSegmentData.java +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestSegmentData.java @@ -2,8 +2,8 @@ package nu.marginalia.index.construction.full; import java.util.Arrays; -record TestSegmentData(long wordId, long start, long end, long[] data) { - public TestSegmentData(long wordId, long start, long end) { +record TestSegmentData(String wordId, long start, long end, long[] data) { + public TestSegmentData(String wordId, long start, long end) { this(wordId, start, end, null); } @@ -22,7 +22,7 @@ record TestSegmentData(long wordId, long start, long end, long[] data) { @Override public int hashCode() { - int result = (int) (wordId ^ (wordId >>> 
32)); + int result = wordId.hashCode(); result = 31 * result + (int) (start ^ (start >>> 32)); result = 31 * result + (int) (end ^ (end >>> 32)); result = 31 * result + Arrays.hashCode(data); diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java index 413b5b8b..6075fa8a 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java @@ -1,6 +1,7 @@ package nu.marginalia.index.construction.prio; import nu.marginalia.array.page.LongQueryBuffer; +import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.index.PrioReverseIndexReader; import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.full.TestJournalFactory; @@ -17,7 +18,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Random; -import static nu.marginalia.index.construction.full.TestJournalFactory.*; +import static nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta; import static nu.marginalia.index.construction.full.TestJournalFactory.wm; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -59,6 +60,11 @@ class PrioPreindexTest { Files.delete(tempDir); } + MurmurHash3_128 hash = new MurmurHash3_128(); + long termId(String keyword) { + return hash.hashKeyword(keyword); + } + @Test public void testFinalizeSimple() throws IOException { var journalReader = journalFactory.createReader( @@ -79,7 +85,7 @@ class PrioPreindexTest { var indexReader = new PrioReverseIndexReader("test", wordsFile, docsFile); - var entrySource = indexReader.documents(50); + var entrySource = indexReader.documents(termId("50")); var lqb = new LongQueryBuffer(32); entrySource.read(lqb); @@ -139,10 +145,10 @@ class PrioPreindexTest { var indexReader = new PrioReverseIndexReader("test", wordsFile, docsFile); - int items = indexReader.numDocuments(50); + int items = indexReader.numDocuments(termId("50")); assertEquals(documentIds.length, items); - var entrySource = indexReader.documents(50); + var entrySource = indexReader.documents(termId("50")); var lqb = new LongQueryBuffer(32); for (int pos = 0; pos < documentIds.length;) { diff --git a/code/index/index-reverse/test/nu/marginalia/test/TestUtil.java b/code/index/index-reverse/test/nu/marginalia/test/TestUtil.java deleted file mode 100644 index 8fbf6b54..00000000 --- a/code/index/index-reverse/test/nu/marginalia/test/TestUtil.java +++ /dev/null @@ -1,43 +0,0 @@ -package nu.marginalia.test; - - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; - -public class TestUtil { - public static void clearTempDir(Path dir) { - if (Files.isDirectory(dir)) { - for (File f : dir.toFile().listFiles()) { - File[] files = f.listFiles(); - if (files != null) { - Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); - } - System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")"); - f.delete(); - } - } - System.out.println("Deleting " + dir); - dir.toFile().delete(); - } - - private static String fileSize(Path path) { - try { - long sizeBytes = Files.size(path); - - if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) 
+ "Gb"; - if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb"; - if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb"; - return sizeBytes + "b"; - } - catch (IOException ex) { - throw new RuntimeException(ex); - } - } - - private static String round(double d) { - return String.format("%.2f", d); - } -} diff --git a/code/index/java/nu/marginalia/index/IndexFactory.java b/code/index/java/nu/marginalia/index/IndexFactory.java index 14e62380..e388793f 100644 --- a/code/index/java/nu/marginalia/index/IndexFactory.java +++ b/code/index/java/nu/marginalia/index/IndexFactory.java @@ -3,11 +3,11 @@ package nu.marginalia.index; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.IndexLocations; +import nu.marginalia.index.forward.ForwardIndexFileNames; +import nu.marginalia.index.forward.ForwardIndexReader; import nu.marginalia.index.index.CombinedIndexReader; import nu.marginalia.index.positions.PositionsFileReader; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.index.forward.ForwardIndexFileNames; -import nu.marginalia.index.forward.ForwardIndexReader; import java.io.IOException; import java.nio.file.Files; @@ -56,7 +56,8 @@ public class IndexFactory { public ForwardIndexReader getForwardIndexReader() throws IOException { return new ForwardIndexReader( ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.CURRENT), - ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.CURRENT) + ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.CURRENT), + ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.CURRENT) ); } diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 58a9a4b0..2b075e58 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -13,7 +13,9 @@ import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqDataInt; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.results.*; +import nu.marginalia.api.searchquery.model.results.ResultRankingContext; +import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; +import nu.marginalia.api.searchquery.model.results.SearchResultSet; import nu.marginalia.array.page.LongQueryBuffer; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; @@ -22,9 +24,9 @@ import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexSearchBudget; import nu.marginalia.index.results.IndexResultRankingService; import nu.marginalia.index.results.model.ids.CombinedDocIdList; +import nu.marginalia.index.searchset.SearchSet; import nu.marginalia.index.searchset.SearchSetsService; import nu.marginalia.index.searchset.SmallSearchSet; -import nu.marginalia.index.searchset.SearchSet; import nu.marginalia.service.module.ServiceConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,7 +34,8 @@ import org.slf4j.Marker; import org.slf4j.MarkerFactory; import 
java.sql.SQLException; -import java.util.*; +import java.util.BitSet; +import java.util.List; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.Executor; import java.util.concurrent.Executors; @@ -142,7 +145,8 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { for (var score : rawResult.keywordScores) { rawItem.addKeywordScores( RpcResultKeywordScore.newBuilder() - .setEncodedWordMetadata(score.encodedWordMetadata()) + .setFlags(score.flags) + .setPositions(score.positionCount) .setKeyword(score.keyword) ); } diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java index 7da5f74b..41c398bf 100644 --- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java +++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java @@ -90,7 +90,7 @@ public class StatefulIndex { return combinedIndexReader != null; } - /** Stronger version of isAvailable() that also checks that the index is loaded */ + /** Stronger version of isAvailable() that also checks that the index is loaded */ public boolean isLoaded() { return combinedIndexReader != null && combinedIndexReader.isLoaded(); } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java index 997273b7..751839bd 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java @@ -1,13 +1,16 @@ package nu.marginalia.index.results; -import nu.marginalia.api.searchquery.model.compiled.*; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.index.index.CombinedIndexReader; import nu.marginalia.index.index.StatefulIndex; -import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.model.QueryParams; +import nu.marginalia.index.model.SearchParameters; +import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.results.model.QuerySearchTerms; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.PubDate; @@ -15,13 +18,13 @@ import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.sequence.CodedSequence; import nu.marginalia.sequence.SequenceOperations; import javax.annotation.Nullable; -import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*; +import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate; +import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate; /** This class is responsible for calculating the score of a search result. 
* It holds the data required to perform the scoring, as there is strong @@ -102,7 +105,7 @@ public class IndexResultScoreCalculator { } private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) { - boolean allSynthetic = booleanAggregate(wordFlagsQuery, WordFlags.Synthetic::isPresent); + boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags)); int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask)); int positionsCount = intMaxMinAggregate(countsQuery, p -> p); @@ -139,27 +142,27 @@ public class IndexResultScoreCalculator { } return booleanAggregate(queryGraphScores, - docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy())); + flags -> meetsQueryStrategyRequirements((byte) flags, queryParams.queryStrategy())); } - private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) { + private boolean meetsQueryStrategyRequirements(byte flags, QueryStrategy queryStrategy) { if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) { - return WordFlags.Site.isPresent(wordMeta); + return WordFlags.Site.isPresent(flags); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) { - return WordFlags.Subjects.isPresent(wordMeta); + return WordFlags.Subjects.isPresent(flags); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) { - return WordFlags.Title.isPresent(wordMeta); + return WordFlags.Title.isPresent(flags); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) { - return WordFlags.UrlPath.isPresent(wordMeta); + return WordFlags.UrlPath.isPresent(flags); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) { - return WordFlags.UrlDomain.isPresent(wordMeta); + return WordFlags.UrlDomain.isPresent(flags); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) { - return WordFlags.ExternalLink.isPresent(wordMeta); + return WordFlags.ExternalLink.isPresent(flags); } return true; } diff --git a/code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java b/code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java index 226ca9ae..671ee8db 100644 --- a/code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java +++ b/code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java @@ -13,10 +13,8 @@ import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; import nu.marginalia.index.index.CombinedIndexReader; import nu.marginalia.index.index.StatefulIndex; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalSlopWriter; import nu.marginalia.index.positions.TermData; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.linkdb.docs.DocumentDbReader; @@ -27,9 +25,10 @@ import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat; +import 
nu.marginalia.sequence.CodedSequence; import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.service.server.Initialization; import nu.marginalia.storage.FileStorageService; @@ -63,7 +62,7 @@ public class CombinedIndexReaderTest { StatefulIndex statefulIndex; @Inject - IndexJournalWriter indexJournalWriter; + IndexJournalSlopWriter indexJournalWriter; @Inject FileStorageService fileStorageService; @@ -248,7 +247,6 @@ public class CombinedIndexReaderTest { outputFileDocs, outputFileWords, outputFilePositions, - IndexJournalReader::singleFile, DocIdRewriter.identity(), tmpDir); constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir); @@ -268,7 +266,6 @@ public class CombinedIndexReaderTest { outputFileDocs, outputFileWords, outputFilePositions, - IndexJournalReader::singleFile, DocIdRewriter.identity(), tmpDir); @@ -279,12 +276,14 @@ public class CombinedIndexReaderTest { Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT); + Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT); Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT); ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat, - IndexJournalReader.paging(workDir), outputFileDocsId, outputFileDocsData, + outputFileSpansData, + IndexJournal.findJournal(workDir).orElseThrow(), domainRankings ); @@ -318,19 +317,26 @@ public class CombinedIndexReaderTest { var meta = metaByDoc.get(doc); - var header = new IndexJournalEntryHeader( - doc, - meta.features, - 100, - meta.documentMetadata.encode() - ); + List<String> keywords = words.stream().map(w -> w.keyword).toList(); + byte[] metadata = new byte[words.size()]; + for (int i = 0; i < words.size(); i++) { + metadata[i] = words.get(i).termMetadata; } + var positions = words.stream().map(w -> w.positions).map(pos -> (CodedSequence) GammaCodedSequence.generate(ByteBuffer.allocate(1024), pos.toIntArray())).toList(); - String[] keywords = words.stream().map(w -> w.keyword).toArray(String[]::new); - long[] metadata = words.stream().map(w -> w.termMetadata).mapToLong(Long::longValue).toArray(); - var positions = words.stream().map(w -> w.positions).map(pos -> GammaCodedSequence.generate(ByteBuffer.allocate(1024), pos.toIntArray())).toArray(GammaCodedSequence[]::new); - - indexJournalWriter.put(header, - new IndexJournalEntryData(keywords, metadata, positions)); + indexJournalWriter.put(doc, + new SlopDocumentRecord.KeywordsProjection( + "", + -1, + meta.features, + meta.documentMetadata.encode(), + 100, + keywords, + metadata, + positions, + new byte[0], + List.of() + )); }); var linkdbWriter = new DocumentDbWriter( @@ -370,10 +376,10 @@ public class CombinedIndexReaderTest { } record MockDocumentMeta(int features, DocumentMetadata documentMetadata) {} - record MockDataKeyword(String keyword, long termMetadata, IntList positions) {} + record MockDataKeyword(String keyword, byte termMetadata, IntList positions) {} MockDataKeyword w(String keyword, WordFlags flags, int... 
positions) { - return new MockDataKeyword(keyword, new WordMetadata(0L, EnumSet.of(flags)).encode(), IntList.of(positions)); + return new MockDataKeyword(keyword, flags.asBit(), IntList.of(positions)); } } diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java index e7e8ecfd..8198e475 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java @@ -4,23 +4,18 @@ import com.google.inject.Guice; import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.IndexLocations; -import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.query.SearchQuery; +import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.index.construction.prio.PrioIndexConstructor; -import nu.marginalia.index.index.StatefulIndex; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.process.control.FakeProcessHeartbeat; -import nu.marginalia.process.control.ProcessHeartbeat; -import nu.marginalia.sequence.GammaCodedSequence; -import nu.marginalia.storage.FileStorageService; import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.full.FullIndexConstructor; +import nu.marginalia.index.construction.prio.PrioIndexConstructor; +import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalSlopWriter; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; @@ -29,12 +24,16 @@ import nu.marginalia.linkdb.docs.DocumentDbWriter; import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.id.UrlIdCodec; -import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.index.domainrankings.DomainRankings; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.model.processed.SlopDocumentRecord; +import nu.marginalia.process.control.FakeProcessHeartbeat; +import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.sequence.CodedSequence; +import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.server.Initialization; +import nu.marginalia.storage.FileStorageService; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; @@ -70,7 +69,7 @@ public class IndexQueryServiceIntegrationSmokeTest { ServiceHeartbeat heartbeat; @Inject - IndexJournalWriter indexJournalWriter; + IndexJournalSlopWriter indexJournalWriter; @Inject FileStorageService fileStorageService; @@ -296,7 +295,6 @@ public class 
IndexQueryServiceIntegrationSmokeTest { outputFileDocs, outputFileWords, outputFilePositions, - IndexJournalReader::singleFile, DocIdRewriter.identity(), tmpDir); @@ -316,7 +314,6 @@ public class IndexQueryServiceIntegrationSmokeTest { var constructor = new PrioIndexConstructor( outputFileDocs, outputFileWords, - IndexJournalReader::singleFile, DocIdRewriter.identity(), tmpDir); @@ -327,12 +324,14 @@ public class IndexQueryServiceIntegrationSmokeTest { Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT); + Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT); Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT); ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat, - IndexJournalReader.paging(workDir), outputFileDocsId, outputFileDocsData, + outputFileSpansData, + IndexJournal.findJournal(workDir).orElseThrow(), domainRankings ); @@ -354,32 +353,44 @@ public class IndexQueryServiceIntegrationSmokeTest { long fullId = fullId(id); - var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); - ldbw.add(new DocdbUrlDetail( fullId, new EdgeUrl("https://www.example.com/"+id), "test", "test", 0., "HTML5", 0, null, 0, 10 )); - String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new); - long[] metadata = new long[factors.length]; + List<String> keywords = IntStream.of(factors).mapToObj(Integer::toString).toList(); + byte[] metadata = new byte[factors.length]; for (int i = 0; i < factors.length; i++) { - metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); - } - GammaCodedSequence[] positions = new GammaCodedSequence[factors.length]; - ByteBuffer wa = ByteBuffer.allocate(32); - for (int i = 0; i < factors.length; i++) { - positions[i] = GammaCodedSequence.generate(wa, factors); + metadata[i] = WordFlags.Title.asBit(); } - indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions)); + List<CodedSequence> positions = new ArrayList<>(); + + ByteBuffer wa = ByteBuffer.allocate(32); + for (int i = 0; i < factors.length; i++) { + positions.add(GammaCodedSequence.generate(wa, factors)); + } + + indexJournalWriter.put(fullId, + new SlopDocumentRecord.KeywordsProjection( + "", + -1, + 0, + new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode(), + 100, + keywords, + metadata, + positions, + new byte[0], + List.of() + )); + } @SneakyThrows public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) { int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); long fullId = UrlIdCodec.encodeId(domain, id); - var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, DocumentMetadata.defaultValue()); ldbw.add(new DocdbUrlDetail( fullId, new EdgeUrl("https://www.example.com/"+id), "test", "test", 0., "HTML5", 0, null, 0, 10 )); - String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new); - long[] metadata = new 
long[factors.length]; + List<String> keywords = IntStream.of(factors).mapToObj(Integer::toString).toList(); + byte[] metadata = new byte[factors.length]; for (int i = 0; i < factors.length; i++) { - metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); - } - GammaCodedSequence[] positions = new GammaCodedSequence[factors.length]; - ByteBuffer wa = ByteBuffer.allocate(16); - for (int i = 0; i < factors.length; i++) { - positions[i] = GammaCodedSequence.generate(wa, i + 1); + metadata[i] = WordFlags.Title.asBit(); } - indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions)); + List<CodedSequence> positions = new ArrayList<>(); + + ByteBuffer wa = ByteBuffer.allocate(32); + for (int i = 0; i < factors.length; i++) { + positions.add(GammaCodedSequence.generate(wa, i + 1)); + } + + indexJournalWriter.put(fullId, + new SlopDocumentRecord.KeywordsProjection( + "", + -1, + 0, + new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode(), + 100, + keywords, + metadata, + positions, + new byte[0], + List.of() + )); + } } diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java index 6155ab83..9cb16270 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java @@ -5,22 +5,19 @@ import com.google.inject.Inject; import it.unimi.dsi.fastutil.ints.IntList; import nu.marginalia.IndexLocations; import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint; -import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.query.SearchQuery; +import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.index.construction.full.FullIndexConstructor; -import nu.marginalia.index.construction.prio.PrioIndexConstructor; -import nu.marginalia.index.index.StatefulIndex; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.sequence.GammaCodedSequence; -import nu.marginalia.storage.FileStorageService; import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.index.construction.DocIdRewriter; +import nu.marginalia.index.construction.full.FullIndexConstructor; +import nu.marginalia.index.construction.prio.PrioIndexConstructor; +import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalSlopWriter; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; @@ -33,12 +30,14 @@ import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.process.control.FakeProcessHeartbeat; 
import nu.marginalia.process.control.ProcessHeartbeat; -import nu.marginalia.index.domainrankings.DomainRankings; +import nu.marginalia.sequence.CodedSequence; +import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.server.Initialization; +import nu.marginalia.storage.FileStorageService; import org.apache.logging.log4j.util.Strings; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -76,7 +75,7 @@ public class IndexQueryServiceIntegrationTest { ServiceHeartbeat heartbeat; @Inject - IndexJournalWriter indexJournalWriter; + IndexJournalSlopWriter indexJournalWriter; @Inject FileStorageService fileStorageService; @@ -475,7 +474,6 @@ public class IndexQueryServiceIntegrationTest { outputFileDocs, outputFileWords, outputFilePositions, - IndexJournalReader::singleFile, DocIdRewriter.identity(), tmpDir); constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir); @@ -493,7 +491,6 @@ public class IndexQueryServiceIntegrationTest { var constructor = new PrioIndexConstructor( outputFileDocs, outputFileWords, - IndexJournalReader::singleFile, DocIdRewriter.identity(), tmpDir); @@ -504,12 +501,14 @@ public class IndexQueryServiceIntegrationTest { Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT); + Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT); Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT); ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat, - IndexJournalReader.paging(workDir), outputFileDocsId, outputFileDocsData, + outputFileSpansData, + IndexJournal.findJournal(workDir).orElseThrow(), domainRankings ); @@ -539,24 +538,32 @@ public class IndexQueryServiceIntegrationTest { var meta = metaByDoc.get(doc); - var header = new IndexJournalEntryHeader( - doc, - meta.features, - 100, - meta.documentMetadata.encode() - ); + List<String> keywords = words.stream().map(w -> w.keyword).toList(); - String[] keywords = words.stream().map(w -> w.keyword).toArray(String[]::new); - long[] metadata = words.stream().map(w -> w.termMetadata).mapToLong(Long::longValue).toArray(); - - GammaCodedSequence[] positions = new GammaCodedSequence[words.size()]; // FIXME: positions? - ByteBuffer workBuffer = ByteBuffer.allocate(8192); - for (int i = 0; i < positions.length; i++) { - positions[i] = GammaCodedSequence.generate(workBuffer, words.get(i).positions); + byte[] metadata = new byte[keywords.size()]; + for (int i = 0; i < words.size(); i++) { + metadata[i] = (byte) words.get(i).termMetadata; } + List<CodedSequence> positions = new ArrayList<>(); + ByteBuffer workBuffer = ByteBuffer.allocate(8192); + for (int i = 0; i < words.size(); i++) { + positions.add(GammaCodedSequence.generate(workBuffer, words.get(i).positions)); + } + + indexJournalWriter.put(doc, + new SlopDocumentRecord.KeywordsProjection( + "", + -1, + meta.features, + meta.documentMetadata.encode(), + 100, + keywords, + metadata, + positions, + new byte[0], + List.of() + )); }); var linkdbWriter = new DocumentDbWriter( @@ -599,8 +606,8 @@ public class IndexQueryServiceIntegrationTest { } record MockDataKeyword(String keyword, long termMetadata, IntList positions) {} public MockDataKeyword w(String keyword, EnumSet<WordFlags> wordFlags, int... positions) { - return new MockDataKeyword(keyword, new WordMetadata(0, wordFlags).encode(), IntList.of(positions)); + return new MockDataKeyword(keyword, WordFlags.encode(wordFlags), IntList.of(positions)); } public MockDataKeyword w(String keyword) { return new MockDataKeyword(keyword, 0L, IntList.of()); } - public MockDataKeyword w(String keyword, WordFlags flags) { return new MockDataKeyword(keyword, new WordMetadata(0L, EnumSet.of(flags)).encode(), IntList.of()); } + public MockDataKeyword w(String keyword, WordFlags flags) { return new MockDataKeyword(keyword, flags.asBit(), IntList.of()); } } diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTestModule.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTestModule.java index e61c42d7..e2438709 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTestModule.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTestModule.java @@ -2,21 +2,23 @@ package nu.marginalia.index; import com.google.inject.AbstractModule; import nu.marginalia.IndexLocations; +import nu.marginalia.index.domainrankings.DomainRankings; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalSlopWriter; import nu.marginalia.index.searchset.SearchSetAny; import nu.marginalia.index.searchset.SearchSetsService; -import nu.marginalia.index.util.TestUtil; -import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBase; -import nu.marginalia.storage.model.FileStorageBaseType; -import nu.marginalia.index.journal.writer.IndexJournalWriter; -import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl; import nu.marginalia.linkdb.docs.DocumentDbReader; import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat; -import nu.marginalia.index.domainrankings.DomainRankings; -import nu.marginalia.service.control.*; import nu.marginalia.service.ServiceId; +import nu.marginalia.service.control.FakeServiceHeartbeat; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.module.ServiceConfiguration; +import nu.marginalia.storage.FileStorageService; +import nu.marginalia.storage.model.FileStorageBase; +import nu.marginalia.storage.model.FileStorageBaseType; +import 
nu.marginalia.test.TestUtil; import org.mockito.Mockito; import java.io.IOException; @@ -41,8 +43,10 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { slowDir = workDir.resolve("slow"); fastDir = workDir.resolve("fast"); + Files.createDirectory(slowDir); Files.createDirectory(fastDir); + Files.createDirectory(fastDir.resolve("iw")); } public void cleanUp() { @@ -75,9 +79,7 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { bind(ServiceEventLog.class).toInstance(Mockito.mock(ServiceEventLog.class)); - bind(IndexJournalWriter.class).toInstance(new IndexJournalWriterPagingImpl( - IndexLocations.getIndexConstructionArea(fileStorageServiceMock) - )); + bind(IndexJournalSlopWriter.class).toInstance(new IndexJournalSlopWriter(IndexJournal.allocateName(fastDir.resolve("iw")), 0)); bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration( ServiceId.Index, diff --git a/code/index/test/nu/marginalia/index/util/TestUtil.java b/code/index/test/nu/marginalia/index/util/TestUtil.java deleted file mode 100644 index 651dd316..00000000 --- a/code/index/test/nu/marginalia/index/util/TestUtil.java +++ /dev/null @@ -1,44 +0,0 @@ -package nu.marginalia.index.util; - - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; - -public class TestUtil { - public static void clearTempDir(Path path) { - if (Files.isDirectory(path)) { - for (File f : path.toFile().listFiles()) { - File[] files = f.listFiles(); - if (files != null) { - Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); - } - System.out.println("Deleting " + f); - f.delete(); - } - } - - System.out.println("Deleting " + path + " (" + fileSize(path) + ")"); - path.toFile().delete(); - } - - private static String fileSize(Path path) { - try { - long sizeBytes = Files.size(path); - - if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb"; - if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb"; - if (sizeBytes > 1024) return round(sizeBytes / 1024.) 
+ "Kb"; - return sizeBytes + "b"; - } - catch (IOException ex) { - throw new RuntimeException(ex); - } - } - - private static String round(double d) { - return String.format("%.2f", d); - } -} diff --git a/code/libraries/array/build.gradle b/code/libraries/array/build.gradle index 4c88a870..862f3a69 100644 --- a/code/libraries/array/build.gradle +++ b/code/libraries/array/build.gradle @@ -26,6 +26,8 @@ dependencies { testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito + + testImplementation project(':code:libraries:test-helpers') } jmh { diff --git a/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortNTest.java b/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortNTest.java index dfbf555e..a866264d 100644 --- a/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortNTest.java +++ b/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortNTest.java @@ -4,7 +4,7 @@ import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; -import nu.marginalia.util.test.TestUtil; +import nu.marginalia.test.TestUtil; import org.apache.commons.lang3.ArrayUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Tag; @@ -13,7 +13,7 @@ import org.junit.jupiter.api.Test; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.Random; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; diff --git a/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortTest.java b/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortTest.java index 2cfde5a7..4619b6a9 100644 --- a/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortTest.java +++ b/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortTest.java @@ -3,7 +3,7 @@ package nu.marginalia.array.algo; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; -import nu.marginalia.util.test.TestUtil; +import nu.marginalia.test.TestUtil; import org.apache.commons.lang3.ArrayUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Tag; diff --git a/code/libraries/array/test/nu/marginalia/util/test/TestUtil.java b/code/libraries/array/test/nu/marginalia/util/test/TestUtil.java deleted file mode 100644 index e3defec1..00000000 --- a/code/libraries/array/test/nu/marginalia/util/test/TestUtil.java +++ /dev/null @@ -1,43 +0,0 @@ -package nu.marginalia.util.test; - - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; - -public class TestUtil { - public static void clearTempDir(Path dir) { - if (Files.isDirectory(dir)) { - for (File f : dir.toFile().listFiles()) { - File[] files = f.listFiles(); - if (files != null) { - Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); - } - System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")"); - f.delete(); - } - } - System.out.println("Deleting " + dir); - dir.toFile().delete(); - } - - private static String fileSize(Path path) { - try { - long sizeBytes = Files.size(path); - - if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) 
+ "Gb"; - if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb"; - if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb"; - return sizeBytes + "b"; - } - catch (IOException ex) { - throw new RuntimeException(ex); - } - } - - private static String round(double d) { - return String.format("%.2f", d); - } -} diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/CodedSequence.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/CodedSequence.java index 1a543f69..c22623ca 100644 --- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/CodedSequence.java +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/CodedSequence.java @@ -1,12 +1,11 @@ package nu.marginalia.sequence; -import blue.strategic.parquet.BinarySerializable; import it.unimi.dsi.fastutil.ints.IntIterator; import it.unimi.dsi.fastutil.ints.IntList; import java.nio.ByteBuffer; -public interface CodedSequence extends BinarySerializable { +public interface CodedSequence { byte[] bytes(); IntIterator iterator(); diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java index 00ae3b23..00fcf097 100644 --- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java @@ -158,7 +158,7 @@ public class GammaCodedSequence implements Iterable, CodedSequence { last = i; // can't encode zeroes - assert delta > 0 : "Sequence must be strictly increasing and may not contain zeroes or negative values"; + assert delta > 0 : "Sequence must be strictly increasing and may not contain zeroes or negative values; was " + sequence; writer.putGamma(delta); } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java index bc26e93e..51396990 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java @@ -1,21 +1,24 @@ package nu.marginalia.language.sentence.tag; public enum HtmlTag { - SCRIPT(true, false), - STYLE(true, false), - CODE(false, true), - PRE(false, true), - TITLE(false, false), - HEADING(false, false), - NAV(false, false), - HEADER(false, false), - FOOTER(false, false); + SCRIPT('s', true, false), + STYLE('S', true, false), + CODE('c', false, true), + PRE('p', false, true), + TITLE('t', false, false), + HEADING('h', false, false), + NAV('n', false, false), + HEADER('H',false, false), + FOOTER('f', false, false); + public char code; public boolean exclude; public boolean nonLanguage; - HtmlTag(boolean exclude, boolean nonLanguage) { + HtmlTag(char code, boolean exclude, boolean nonLanguage) { + this.code = code; this.exclude = exclude; this.nonLanguage = nonLanguage; } + } diff --git a/code/libraries/slop/build.gradle b/code/libraries/slop/build.gradle index 2ea970ad..4a7c951a 100644 --- a/code/libraries/slop/build.gradle +++ b/code/libraries/slop/build.gradle @@ -15,6 +15,8 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation libs.bundles.slf4j + implementation project(':code:libraries:coded-sequence') + implementation libs.notnull implementation libs.commons.lang3 implementation libs.fastutil @@ -22,6 +24,7 @@ dependencies { implementation libs.guava 
implementation libs.commons.compress + testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceColumn.java new file mode 100644 index 00000000..55e19f80 --- /dev/null +++ b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceColumn.java @@ -0,0 +1,121 @@ +package nu.marginalia.slop.column.dynamic; + +import nu.marginalia.sequence.GammaCodedSequence; +import nu.marginalia.slop.desc.ColumnDesc; +import nu.marginalia.slop.desc.ColumnFunction; +import nu.marginalia.slop.desc.ColumnType; +import nu.marginalia.slop.desc.StorageType; +import nu.marginalia.slop.storage.Storage; +import nu.marginalia.slop.storage.StorageReader; +import nu.marginalia.slop.storage.StorageWriter; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Path; + +public class GammaCodedSequenceColumn { + + public static GammaCodedSequenceReader open(Path path, ColumnDesc name) throws IOException { + return new Reader( + Storage.reader(path, name, false), // note we must never pass aligned=true here, as the data is not guaranteed alignment + VarintColumn.open(path, name.createDerivative(ColumnFunction.DATA_LEN, + ColumnType.VARINT_LE, + StorageType.PLAIN) + ) + ); + } + + public static GammaCodedSequenceWriter create(Path path, ColumnDesc name) throws IOException { + return new Writer( + Storage.writer(path, name), + VarintColumn.create(path, name.createDerivative(ColumnFunction.DATA_LEN, + ColumnType.VARINT_LE, + StorageType.PLAIN) + ) + ); + } + + private static class Writer implements GammaCodedSequenceWriter { + private final VarintColumnWriter indexWriter; + private final StorageWriter storage; + + public Writer(StorageWriter storage, + VarintColumnWriter indexWriter) + { + this.storage = storage; + + this.indexWriter = indexWriter; + } + + + @Override + public void put(GammaCodedSequence sequence) throws IOException { + var buffer = sequence.buffer(); + int length = buffer.remaining(); + + indexWriter.put(length); + storage.putBytes(buffer); + } + + public void close() throws IOException { + indexWriter.close(); + storage.close(); + } + } + + private static class Reader implements GammaCodedSequenceReader { + private final VarintColumnReader indexReader; + private final StorageReader storage; + + public Reader(StorageReader reader, VarintColumnReader indexReader) throws IOException { + this.storage = reader; + this.indexReader = indexReader; + } + + @Override + public void skip(long positions) throws IOException { + for (int i = 0; i < positions; i++) { + int size = (int) indexReader.get(); + storage.skip(size, 1); + } + } + + @Override + public boolean hasRemaining() throws IOException { + return indexReader.hasRemaining(); + } + + public long position() throws IOException { + return indexReader.position(); + } + + @Override + public GammaCodedSequence get(ByteBuffer workArea) throws IOException { + int size = (int) indexReader.get(); + + workArea.clear(); + workArea.limit(size); + storage.getBytes(workArea); + workArea.flip(); + + return new GammaCodedSequence(workArea); + } + + @Override + public void getData(ByteBuffer workArea) throws IOException { + int size = (int) indexReader.get(); + + int oldLimit = workArea.limit(); + workArea.limit(workArea.position() + size); + storage.getBytes(workArea); + workArea.limit(oldLimit); + } 
+ + + public void close() throws IOException { + indexReader.close(); + storage.close(); + } + + } +} diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceReader.java b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceReader.java new file mode 100644 index 00000000..87b7f319 --- /dev/null +++ b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceReader.java @@ -0,0 +1,34 @@ +package nu.marginalia.slop.column.dynamic; + +import nu.marginalia.sequence.CodedSequence; +import nu.marginalia.slop.column.ColumnReader; + +import java.io.IOException; + import java.nio.ByteBuffer; + +public interface GammaCodedSequenceReader extends AutoCloseable, ColumnReader { + /** Read the next gamma-coded sequence from the column. Unlike most other + * readers, this method requires an intermediate buffer to use for reading + * the sequence. As this buffer typically needs to be fairly large to accommodate + * the largest possible sequence, it is not practical to allocate a new buffer + * for each call to this method. Instead, the caller should allocate a single + * buffer up front and reuse it across calls. + * + * @param workArea A buffer to use for reading the sequence. + * @return The next gamma-coded sequence. + */ + CodedSequence get(ByteBuffer workArea) throws IOException; + + /** Read just the data portion of the next gamma-coded sequence from the column. + * This method is useful when the caller is only interested in the data portion + * of the sequence and does not want to decode the values. + * + * The position of the buffer is advanced to the end of the data that has just been read, + * and the limit remains the same. + * + * @param workArea A buffer to use for reading the data.
+ */ + void getData(ByteBuffer workArea) throws IOException; + + void close() throws IOException; +} diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceWriter.java b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceWriter.java new file mode 100644 index 00000000..7a15c37d --- /dev/null +++ b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceWriter.java @@ -0,0 +1,11 @@ +package nu.marginalia.slop.column.dynamic; + +import nu.marginalia.sequence.GammaCodedSequence; +import nu.marginalia.slop.column.ColumnWriter; + +import java.io.IOException; + +public interface GammaCodedSequenceWriter extends AutoCloseable, ColumnWriter { + void put(GammaCodedSequence sequence) throws IOException; + void close() throws IOException; +} diff --git a/code/libraries/slop/java/nu/marginalia/slop/desc/ColumnType.java b/code/libraries/slop/java/nu/marginalia/slop/desc/ColumnType.java index d83096d8..92e0614a 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/desc/ColumnType.java +++ b/code/libraries/slop/java/nu/marginalia/slop/desc/ColumnType.java @@ -47,6 +47,7 @@ public abstract class ColumnType< public static ColumnType VARINT_LE = register("varintle", ByteOrder.LITTLE_ENDIAN, VarintColumn::open, VarintColumn::create); public static ColumnType VARINT_BE = register("varintbe", ByteOrder.BIG_ENDIAN, VarintColumn::open, VarintColumn::create); public static ColumnType BYTE_ARRAY_CUSTOM = register("s8[]+custom", ByteOrder.nativeOrder(), CustomBinaryColumn::open, CustomBinaryColumn::create); + public static ColumnType BYTE_ARRAY_GCS = register("s8[]+gcs", ByteOrder.nativeOrder(), GammaCodedSequenceColumn::open, GammaCodedSequenceColumn::create); public static ColumnType STRING = register("s8[]+str", ByteOrder.nativeOrder(), StringColumn::open, StringColumn::create); public static ColumnType CSTRING = register("s8+cstr", ByteOrder.nativeOrder(), StringColumn::open, StringColumn::create); public static ColumnType TXTSTRING = register("s8+txt", ByteOrder.nativeOrder(), StringColumn::open, StringColumn::create); diff --git a/code/tools/integration-test/test/nu/marginalia/test/TestUtil.java b/code/libraries/test-helpers/java/nu/marginalia/test/TestUtil.java similarity index 94% rename from code/tools/integration-test/test/nu/marginalia/test/TestUtil.java rename to code/libraries/test-helpers/java/nu/marginalia/test/TestUtil.java index 43332601..808dfcf7 100644 --- a/code/tools/integration-test/test/nu/marginalia/test/TestUtil.java +++ b/code/libraries/test-helpers/java/nu/marginalia/test/TestUtil.java @@ -13,7 +13,9 @@ public class TestUtil { return; if (Files.isDirectory(path)) { - for (File f : path.toFile().listFiles()) { + var contents = path.toFile().listFiles(); + + for (File f : contents) { if (f.isDirectory()) { File[] files = f.listFiles(); if (files != null) { diff --git a/code/process-models/crawl-spec/build.gradle b/code/process-models/crawl-spec/build.gradle deleted file mode 100644 index 2737e54a..00000000 --- a/code/process-models/crawl-spec/build.gradle +++ /dev/null @@ -1,32 +0,0 @@ -plugins { - id 'java' - - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) - } -} -apply from: "$rootProject.projectDir/srcsets.gradle" - -dependencies { - implementation libs.bundles.slf4j - - implementation project(':third-party:parquet-floor') - implementation project(':code:common:config') - implementation project(':code:common:db') - 
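Putting the three new slop files together: GammaCodedSequenceColumn writes each sequence's raw bytes to the main storage file and its byte length to the derived varint column (the DATA_LEN derivative created in open()/create()), which is what lets hasRemaining() and skip() walk the column without decoding. A sketch of reading such a column back, reusing a single work buffer across get() calls as the GammaCodedSequenceReader javadoc requires; path and columnDesc are assumed to come from the surrounding slop table setup:

    ByteBuffer workArea = ByteBuffer.allocate(65536); // allocated once, reused for every get()

    try (GammaCodedSequenceReader reader = GammaCodedSequenceColumn.open(path, columnDesc)) {
        while (reader.hasRemaining()) {
            CodedSequence positions = reader.get(workArea);

            var iter = positions.iterator();
            while (iter.hasNext()) {
                int position = iter.nextInt(); // decoded, strictly increasing values
            }
        }
    }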
implementation project(':code:common:linkdb') - - implementation libs.notnull - implementation libs.trove - implementation libs.bundles.parquet - implementation libs.bundles.mariadb - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito -} - diff --git a/code/process-models/crawl-spec/readme.md b/code/process-models/crawl-spec/readme.md deleted file mode 100644 index cd59f23c..00000000 --- a/code/process-models/crawl-spec/readme.md +++ /dev/null @@ -1,16 +0,0 @@ -# Crawl Spec - -A crawl spec is a list of domains to be crawled. It is a parquet file with the following columns: - -- `domain`: The domain to be crawled -- `crawlDepth`: The depth to which the domain should be crawled -- `urls`: A list of known URLs to be crawled - -Crawl specs are used to define the scope of a crawl in the absence of known domains. - -The [CrawlSpecRecord](java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java) class is -used to represent a record in the crawl spec. - -The [CrawlSpecRecordParquetFileReader](java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java) -and [CrawlSpecRecordParquetFileWriter](java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java) -classes are used to read and write the crawl spec parquet files. diff --git a/code/process-models/processed-data/java/nu/marginalia/io/processed/DocumentRecordParquetFileReader.java b/code/process-models/processed-data/java/nu/marginalia/io/processed/DocumentRecordParquetFileReader.java deleted file mode 100644 index dae53224..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/io/processed/DocumentRecordParquetFileReader.java +++ /dev/null @@ -1,37 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.HydratorSupplier; -import blue.strategic.parquet.ParquetReader; -import nu.marginalia.model.processed.DocumentRecord; -import nu.marginalia.model.processed.DocumentRecordKeywordsProjection; -import nu.marginalia.model.processed.DocumentRecordMetadataProjection; -import org.jetbrains.annotations.NotNull; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.stream.Stream; - -public class DocumentRecordParquetFileReader { - - @NotNull - public static Stream stream(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(DocumentRecord.newHydrator())); - } - - @NotNull - public static Stream streamKeywordsProjection(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(DocumentRecordKeywordsProjection.newHydrator()), - DocumentRecordKeywordsProjection.requiredColumns() - ); - } - - @NotNull - public static Stream streamMetadataProjection(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(DocumentRecordMetadataProjection.newHydrator()), - DocumentRecordMetadataProjection.requiredColumns() - ); - } -} diff --git a/code/process-models/processed-data/java/nu/marginalia/io/processed/DocumentRecordParquetFileWriter.java b/code/process-models/processed-data/java/nu/marginalia/io/processed/DocumentRecordParquetFileWriter.java deleted file mode 100644 index 8e9b9657..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/io/processed/DocumentRecordParquetFileWriter.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.ParquetWriter; -import 
nu.marginalia.model.processed.DocumentRecord; - -import java.io.IOException; -import java.nio.file.Path; - -public class DocumentRecordParquetFileWriter implements AutoCloseable { - private final ParquetWriter writer; - - public DocumentRecordParquetFileWriter(Path file) throws IOException { - writer = ParquetWriter.writeFile(DocumentRecord.schema, - file.toFile(), DocumentRecord.newDehydrator()); - } - - public void write(DocumentRecord documentRecord) throws IOException { - writer.write(documentRecord); - } - - public void close() throws IOException { - writer.close(); - } -} diff --git a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReader.java b/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReader.java deleted file mode 100644 index efa109cc..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReader.java +++ /dev/null @@ -1,30 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.HydratorSupplier; -import blue.strategic.parquet.ParquetReader; -import nu.marginalia.model.processed.DomainLinkRecord; -import org.jetbrains.annotations.NotNull; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -public class DomainLinkRecordParquetFileReader { - @NotNull - public static Stream stream(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(DomainLinkRecord.newHydrator())); - } - - @NotNull - public static Set getDestDomainNames(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(DomainLinkRecord.newDestDomainHydrator()), - List.of("dest")) - .collect(Collectors.toSet()); - } - -} diff --git a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileWriter.java b/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileWriter.java deleted file mode 100644 index 28cf3aa0..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileWriter.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.ParquetWriter; -import nu.marginalia.model.processed.DomainLinkRecord; - -import java.io.IOException; -import java.nio.file.Path; - -public class DomainLinkRecordParquetFileWriter implements AutoCloseable { - private final ParquetWriter writer; - - public DomainLinkRecordParquetFileWriter(Path file) throws IOException { - writer = ParquetWriter.writeFile(DomainLinkRecord.schema, - file.toFile(), DomainLinkRecord.newDehydrator()); - } - - public void write(DomainLinkRecord domainData) throws IOException { - writer.write(domainData); - } - - public void close() throws IOException { - writer.close(); - } -} diff --git a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java b/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java deleted file mode 100644 index a0714557..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java +++ /dev/null @@ -1,31 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.HydratorSupplier; -import 
blue.strategic.parquet.ParquetReader; -import nu.marginalia.model.processed.DomainRecord; -import nu.marginalia.model.processed.DomainWithIp; -import org.jetbrains.annotations.NotNull; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; -import java.util.stream.Stream; - -public class DomainRecordParquetFileReader { - - @NotNull - public static Stream stream(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(DomainRecord.newHydrator())); - } - - @NotNull - public static List getBasicDomainInformation(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(DomainRecord.newDomainNameHydrator()), - List.of("domain", "ip")) - .toList(); - } - - -} diff --git a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainRecordParquetFileWriter.java b/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainRecordParquetFileWriter.java deleted file mode 100644 index 31c59582..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainRecordParquetFileWriter.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.ParquetWriter; -import nu.marginalia.model.processed.DomainRecord; - -import java.io.IOException; -import java.nio.file.Path; - -public class DomainRecordParquetFileWriter implements AutoCloseable { - private final ParquetWriter writer; - - public DomainRecordParquetFileWriter(Path file) throws IOException { - writer = ParquetWriter.writeFile(DomainRecord.schema, - file.toFile(), DomainRecord.newDehydrator()); - } - - public void write(DomainRecord domainData) throws IOException { - writer.write(domainData); - } - - public void close() throws IOException { - writer.close(); - } -} diff --git a/code/process-models/processed-data/java/nu/marginalia/io/processed/ProcessedDataFileNames.java b/code/process-models/processed-data/java/nu/marginalia/io/processed/ProcessedDataFileNames.java deleted file mode 100644 index fafb393f..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/io/processed/ProcessedDataFileNames.java +++ /dev/null @@ -1,73 +0,0 @@ -package nu.marginalia.io.processed; - -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; - -public class ProcessedDataFileNames { - public static Path documentFileName(Path base, int batchNumber) { - return base.resolve(String.format("document%04d.parquet", batchNumber)); - } - public static Path domainFileName(Path base, int batchNumber) { - return base.resolve(String.format("domain%04d.parquet", batchNumber)); - } - public static Path domainLinkFileName(Path base, int batchNumber) { - return base.resolve(String.format("domain-link%04d.parquet", batchNumber)); - } - - public static List listDocumentFiles(Path base, int untilBatch) { - List ret = new ArrayList<>(untilBatch); - - for (int i = 0; i < untilBatch; i++) { - Path maybe = documentFileName(base, i); - if (Files.exists(maybe)) { - ret.add(maybe); - } - } - - return ret; - } - - public static List listDomainFiles(Path base, int untilBatch) { - List ret = new ArrayList<>(untilBatch); - - for (int i = 0; i < untilBatch; i++) { - Path maybe = domainFileName(base, i); - if (Files.exists(maybe)) { - ret.add(maybe); - } - } - - return ret; - } - - public static List listDomainFiles(Path base) { - List ret = new ArrayList<>(); - - for (int i = 0;; i++) { - Path maybe 
= domainFileName(base, i); - if (Files.exists(maybe)) { - ret.add(maybe); - } - else { - break; - } - } - - return ret; - } - - public static List listDomainLinkFiles(Path base, int untilBatch) { - List ret = new ArrayList<>(untilBatch); - - for (int i = 0; i < untilBatch; i++) { - Path maybe = domainLinkFileName(base, i); - if (Files.exists(maybe)) { - ret.add(maybe); - } - } - - return ret; - } -} diff --git a/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecord.java b/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecord.java deleted file mode 100644 index 70403c5e..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecord.java +++ /dev/null @@ -1,185 +0,0 @@ -package nu.marginalia.model.processed; - -import blue.strategic.parquet.Dehydrator; -import blue.strategic.parquet.Hydrator; -import blue.strategic.parquet.ValueWriter; -import gnu.trove.list.TLongList; -import gnu.trove.list.array.TLongArrayList; -import lombok.*; -import nu.marginalia.sequence.CodedSequence; -import nu.marginalia.sequence.GammaCodedSequence; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.Types; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; - -import java.util.ArrayList; -import java.util.List; - -import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; - -@Getter -@Setter -@NoArgsConstructor -@AllArgsConstructor -@EqualsAndHashCode -@ToString -public class DocumentRecord { - @NotNull - public String domain; - @NotNull - public String url; - - public int ordinal; - - @NotNull - public String state; - @Nullable - public String stateReason; - - @Nullable - public String title; - @Nullable - public String description; - public int htmlFeatures; - @Nullable - public String htmlStandard; - - public int length; - public long hash; - public float quality; - - public long documentMetadata; - - @Nullable - public Integer pubYear; - - @Nullable - public List words; - @Nullable - public TLongList metas; - @Nullable - public List positions; - - public static Hydrator newHydrator() { - return new DocumentDataHydrator(); - } - - public static Dehydrator newDehydrator() { - return DocumentRecord::dehydrate; - } - - public static MessageType schema = new MessageType( - DocumentRecord.class.getSimpleName(), - Types.required(BINARY).as(stringType()).named("domain"), - Types.required(BINARY).as(stringType()).named("url"), - Types.required(INT32).named("ordinal"), - Types.required(BINARY).as(stringType()).named("state"), - Types.optional(BINARY).as(stringType()).named("stateReason"), - Types.optional(BINARY).as(stringType()).named("title"), - Types.optional(BINARY).as(stringType()).named("description"), - Types.optional(INT32).named("htmlFeatures"), - Types.optional(BINARY).as(stringType()).named("htmlStandard"), - Types.optional(INT64).named("hash"), - Types.optional(INT64).named("documentMetadata"), - Types.optional(INT32).named("length"), - Types.optional(FLOAT).named("quality"), - Types.optional(INT32).named("pubYear"), - Types.repeated(INT64).named("wordMeta"), - Types.repeated(BINARY).named("positions"), - Types.repeated(BINARY).as(stringType()).named("word") - ); - - @SneakyThrows - public DocumentRecord add(String heading, Object value) { - switch (heading) { - case "domain" -> domain = (String) value; - case "url" -> url = (String) value; - case "ordinal" 
-> ordinal = (Integer) value; - case "htmlFeatures" -> htmlFeatures = (Integer) value; - case "length" -> length = (Integer) value; - case "pubYear" -> pubYear = (Integer) value; - case "hash" -> hash = (Long) value; - case "documentMetadata" -> documentMetadata = (Long) value; - case "quality" -> quality = (Float) value; - case "state" -> state = (String) value; - case "stateReason" -> stateReason = (String) value; - case "title" -> title = (String) value; - case "description" -> description = (String) value; - case "htmlStandard" -> htmlStandard = (String) value; - case "word" -> { - if (this.words == null) - this.words = new ArrayList<>(100); - this.words.add((String) value); - } - case "wordMeta" -> { - if (this.metas == null) { - this.metas = new TLongArrayList(100); - } - this.metas.add((long) value); - } - case "positions" -> { - if (this.positions == null) { - this.positions = new ArrayList<>(100); - } - this.positions.add(new GammaCodedSequence((byte[]) value)); - } - default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); - } - return this; - } - - public void dehydrate(ValueWriter valueWriter) { - valueWriter.write("domain", domain); - valueWriter.write("url", url); - valueWriter.write("ordinal", ordinal); - valueWriter.write("state", state); - - if (stateReason != null) - valueWriter.write("stateReason", stateReason); - if (title != null) - valueWriter.write("title", title); - if (description != null) - valueWriter.write("description", description); - valueWriter.write("htmlFeatures", htmlFeatures); - valueWriter.write("htmlStandard", htmlStandard); - valueWriter.write("documentMetadata", documentMetadata); - valueWriter.write("length", length); - valueWriter.write("hash", hash); - valueWriter.write("quality", quality); - if (pubYear != null) { - valueWriter.write("pubYear", pubYear); - } - if (metas != null) { - valueWriter.writeList("wordMeta", metas); - } - if (positions != null) { - valueWriter.writeBinarySerializableList("positions", positions); - } - - if (words != null) { - valueWriter.writeList("word", words); - } - } - -} - -class DocumentDataHydrator implements Hydrator { - - @Override - public DocumentRecord start() { - return new DocumentRecord(); - } - - @Override - public DocumentRecord add(DocumentRecord target, String heading, Object value) { - return target.add(heading, value); - } - - @Override - public DocumentRecord finish(DocumentRecord target) { - return target; - } - -} diff --git a/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java b/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java deleted file mode 100644 index 5940de7b..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java +++ /dev/null @@ -1,97 +0,0 @@ -package nu.marginalia.model.processed; - -import blue.strategic.parquet.Hydrator; -import gnu.trove.list.TLongList; -import gnu.trove.list.array.TLongArrayList; -import lombok.*; -import nu.marginalia.sequence.CodedSequence; -import nu.marginalia.sequence.GammaCodedSequence; -import org.jetbrains.annotations.NotNull; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -@Getter -@Setter -@NoArgsConstructor -@AllArgsConstructor -@EqualsAndHashCode -@ToString -public class DocumentRecordKeywordsProjection { - @NotNull - public String domain; - - public int ordinal; - - public int htmlFeatures; - public long 
documentMetadata; - - public int length; - - public List words; - public TLongList metas; - public List positions; - - public boolean hasKeywords() { - return words != null && metas != null; - } - - public static Hydrator newHydrator() { - return new DocumentRecordKeywordsProjectionHydrator(); - } - - public static Collection requiredColumns() { - return List.of("domain", "ordinal", "htmlFeatures", "word", "wordMeta", "documentMetadata", "length", "positions"); - } - - @SneakyThrows - public DocumentRecordKeywordsProjection add(String heading, Object value) { - switch (heading) { - case "domain" -> domain = (String) value; - case "length" -> length = (Integer) value; - case "ordinal" -> ordinal = (Integer) value; - case "htmlFeatures" -> htmlFeatures = (Integer) value; - case "documentMetadata" -> documentMetadata = (Long) value; - case "word" -> { - if (this.words == null) - this.words = new ArrayList<>(100); - this.words.add((String) value); - } - case "wordMeta" -> { - if (this.metas == null) { - this.metas = new TLongArrayList(100); - } - this.metas.add((long) value); - } - case "positions" -> { - if (this.positions == null) { - this.positions = new ArrayList<>(100); - } - this.positions.add(new GammaCodedSequence((byte[]) value)); - } - default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); - } - return this; - } - -} - -class DocumentRecordKeywordsProjectionHydrator implements Hydrator { - - @Override - public DocumentRecordKeywordsProjection start() { - return new DocumentRecordKeywordsProjection(); - } - - @Override - public DocumentRecordKeywordsProjection add(DocumentRecordKeywordsProjection target, String heading, Object value) { - return target.add(heading, value); - } - - @Override - public DocumentRecordKeywordsProjection finish(DocumentRecordKeywordsProjection target) { - return target; - } - -} diff --git a/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordMetadataProjection.java b/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordMetadataProjection.java deleted file mode 100644 index ccad52e3..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordMetadataProjection.java +++ /dev/null @@ -1,100 +0,0 @@ -package nu.marginalia.model.processed; - -import blue.strategic.parquet.Hydrator; -import lombok.*; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; - -import java.util.Collection; -import java.util.List; - -@Getter -@Setter -@NoArgsConstructor -@AllArgsConstructor -@EqualsAndHashCode -@ToString -public class DocumentRecordMetadataProjection { - @NotNull - public String domain; - @NotNull - public String url; - - public int ordinal; - - @NotNull - public String state; - @Nullable - public String stateReason; - - @Nullable - public String title; - @Nullable - public String description; - public int htmlFeatures; - @Nullable - public String htmlStandard; - - public int length; - public long hash; - public float quality; - - public long documentMetadata; - - @Nullable - public Integer pubYear; - - public static Collection requiredColumns() { - return List.of("domain", "url", "ordinal", "htmlFeatures", "length", "pubYear", - "hash", "documentMetadata", "quality", "state", "stateReason", - "title", "description", "htmlStandard"); - } - - public DocumentRecordMetadataProjection add(String heading, Object value) { - switch (heading) { - case "domain" -> domain = (String) value; - case "url" -> url = 
(String) value; - case "ordinal" -> ordinal = (Integer) value; - case "htmlFeatures" -> htmlFeatures = (Integer) value; - case "length" -> length = (Integer) value; - case "pubYear" -> pubYear = (Integer) value; - case "hash" -> hash = (Long) value; - case "documentMetadata" -> documentMetadata = (Long) value; - case "quality" -> quality = (Float) value; - case "state" -> state = (String) value; - case "stateReason" -> stateReason = (String) value; - case "title" -> title = (String) value; - case "description" -> description = (String) value; - case "htmlStandard" -> htmlStandard = (String) value; - - default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); - } - return this; - } - - public static Hydrator newHydrator() { - return new DocumentRecordMetadataHydrator(); - } - - - -} - -class DocumentRecordMetadataHydrator implements Hydrator { - - @Override - public DocumentRecordMetadataProjection start() { - return new DocumentRecordMetadataProjection(); - } - - @Override - public DocumentRecordMetadataProjection add(DocumentRecordMetadataProjection target, String heading, Object value) { - return target.add(heading, value); - } - - @Override - public DocumentRecordMetadataProjection finish(DocumentRecordMetadataProjection target) { - return target; - } - -} diff --git a/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainLinkRecord.java b/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainLinkRecord.java deleted file mode 100644 index 298d6192..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainLinkRecord.java +++ /dev/null @@ -1,97 +0,0 @@ -package nu.marginalia.model.processed; - -import blue.strategic.parquet.Dehydrator; -import blue.strategic.parquet.Hydrator; -import blue.strategic.parquet.ValueWriter; -import lombok.*; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.Types; -import org.jetbrains.annotations.NotNull; - -import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; - -@Getter -@Setter -@NoArgsConstructor -@AllArgsConstructor -@EqualsAndHashCode -public class DomainLinkRecord { - @NotNull - public String source; - - @NotNull - public String dest; - - public void dehydrate(ValueWriter valueWriter) { - valueWriter.write("source", source); - valueWriter.write("dest", dest); - } - - public static Dehydrator newDehydrator() { - return DomainLinkRecord::dehydrate; - } - - public static Hydrator newHydrator() { - return new DomainLinkDataHydrator(); - } - public static Hydrator newDestDomainHydrator() { - return new DestDomainNameHydrator(); - } - - public static MessageType schema = new MessageType( - DomainLinkRecord.class.getSimpleName(), - Types.required(BINARY).as(stringType()).named("source"), - Types.required(BINARY).as(stringType()).named("dest") - ); - - public DomainLinkRecord add(String heading, Object value) { - switch (heading) { - case "source" -> source = (String) value; - case "dest" -> dest = (String) value; - default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); - } - return this; - } - -} - -class DomainLinkDataHydrator implements Hydrator { - - @Override - public DomainLinkRecord start() { - return new DomainLinkRecord(); - } - - @Override - public DomainLinkRecord add(DomainLinkRecord target, String heading, Object value) { - return target.add(heading, value); - } - - @Override - 
public DomainLinkRecord finish(DomainLinkRecord target) { - return target; - } - -} - -class DestDomainNameHydrator implements Hydrator { - - @Override - public String start() { - return ""; - } - - @Override - public String add(String target, String heading, Object value) { - if ("dest".equals(heading)) { - return (String) value; - } - return target; - } - - @Override - public String finish(String target) { - return target; - } -} \ No newline at end of file diff --git a/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainRecord.java b/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainRecord.java deleted file mode 100644 index b696829f..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainRecord.java +++ /dev/null @@ -1,148 +0,0 @@ -package nu.marginalia.model.processed; - -import blue.strategic.parquet.Dehydrator; -import blue.strategic.parquet.Hydrator; -import blue.strategic.parquet.ValueWriter; -import lombok.*; -import org.apache.parquet.schema.*; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; - -import java.util.ArrayList; -import java.util.List; - -import static org.apache.parquet.schema.LogicalTypeAnnotation.*; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; - -@Getter -@Setter -@NoArgsConstructor -@AllArgsConstructor -@EqualsAndHashCode -@ToString -public class DomainRecord { - @NotNull - public String domain; - - public int knownUrls; - public int goodUrls; - public int visitedUrls; - - @Nullable - public String state; - @Nullable - public String redirectDomain; - @Nullable - public String ip; - - public List rssFeeds; - - - public static Hydrator newHydrator() { - return new DomainHydrator(); - } - - public static Dehydrator newDehydrator() { - return DomainRecord::dehydrate; - } - - public static Hydrator newDomainNameHydrator() { - return new DomainWithIpHydrator(); - } - - - public static MessageType schema = new MessageType( - DomainRecord.class.getSimpleName(), - Types.required(BINARY).as(stringType()).named("domain"), - Types.optional(INT32).named("knownUrls"), - Types.optional(INT32).named("visitedUrls"), - Types.optional(INT32).named("goodUrls"), - Types.required(BINARY).as(stringType()).named("state"), - Types.optional(BINARY).as(stringType()).named("redirectDomain"), - Types.optional(BINARY).as(stringType()).named("ip"), - Types.repeated(BINARY).as(stringType()).named("rss") - ); - - DomainRecord add(String heading, Object value) { - switch (heading) { - case "domain" -> domain = (String) value; - case "knownUrls" -> knownUrls = (Integer) value; - case "visitedUrls" -> visitedUrls = (Integer) value; - case "goodUrls" -> goodUrls = (Integer) value; - case "state" -> state = (String) value; - case "redirectDomain" -> redirectDomain = (String) value; - case "ip" -> ip = (String) value; - case "rss" -> { - if (rssFeeds == null) { - rssFeeds = new ArrayList<>(); - } - rssFeeds.add((String) value); - } - default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); - } - return this; - } - - private void dehydrate(ValueWriter valueWriter) { - valueWriter.write("domain", domain); - valueWriter.write("knownUrls", knownUrls); - valueWriter.write("goodUrls", goodUrls); - valueWriter.write("visitedUrls", visitedUrls); - if (state != null) { - valueWriter.write("state", state); - } - if (redirectDomain != null) { - 
valueWriter.write("redirectDomain", redirectDomain); - } - if (ip != null) { - valueWriter.write("ip", ip); - } - if (rssFeeds != null) { - valueWriter.writeList("rss", rssFeeds); - } - } - -} - - -class DomainHydrator implements Hydrator { - @Override - public DomainRecord start() { - return new DomainRecord(); - } - - @Override - public DomainRecord add(DomainRecord target, String heading, Object value) { - return target.add(heading, value); - } - - @Override - public DomainRecord finish(DomainRecord target) { - return target; - } -} - -class DomainWithIpHydrator implements Hydrator { - - @Override - public DomainWithIp start() { - return new DomainWithIp(); - } - - @Override - public DomainWithIp add(DomainWithIp target, String heading, Object value) { - if ("domain".equals(heading)) { - target.domain = (String) value; - } - else if ("ip".equals(heading)) { - target.ip = (String) value; - } - return target; - } - - @Override - public DomainWithIp finish(DomainWithIp target) { - return target; - } -} \ No newline at end of file diff --git a/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainWithIp.java b/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainWithIp.java deleted file mode 100644 index 3782b1b2..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainWithIp.java +++ /dev/null @@ -1,15 +0,0 @@ -package nu.marginalia.model.processed; - -import lombok.AllArgsConstructor; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; -import lombok.ToString; - -@AllArgsConstructor -@NoArgsConstructor -@EqualsAndHashCode -@ToString -public class DomainWithIp { - public String domain; - public String ip; -} diff --git a/code/process-models/processed-data/test/nu/marginalia/io/processed/DocumentRecordParquetFileReaderTest.java b/code/process-models/processed-data/test/nu/marginalia/io/processed/DocumentRecordParquetFileReaderTest.java deleted file mode 100644 index 21cc7e2b..00000000 --- a/code/process-models/processed-data/test/nu/marginalia/io/processed/DocumentRecordParquetFileReaderTest.java +++ /dev/null @@ -1,107 +0,0 @@ -package nu.marginalia.io.processed; - -import gnu.trove.list.array.TLongArrayList; -import nu.marginalia.model.processed.DocumentRecord; -import nu.marginalia.sequence.CodedSequence; -import nu.marginalia.sequence.GammaCodedSequence; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; -import java.util.stream.IntStream; -import java.util.stream.LongStream; -import java.util.stream.Stream; - -import static org.junit.jupiter.api.Assertions.*; - -class DocumentRecordParquetFileReaderTest { - Path parquetFile; - - @BeforeEach - public void setUp() throws IOException { - parquetFile = Files.createTempFile(getClass().getSimpleName(), ".parquet"); - } - - @AfterEach - public void tearDown() throws IOException { - Files.deleteIfExists(parquetFile); - } - - @Test - public void test() throws IOException { - - ByteBuffer workArea = ByteBuffer.allocate(1024); - - var doc = new DocumentRecord( - "www.marginalia.nu", - "https://www.marginalia.nu/", - 0, - "OK", - null, - "Itsa me, Marginalia!", - "Hello World", - 3, - "HTML5", - 123, - 0xF00BA3L, - 0.25f, - 4L, - null, - List.of("Hello", "world"), - new TLongArrayList(new long[] { 2L, 3L}), - List.of( - 
GammaCodedSequence.generate(workArea, 1, 2, 3), - GammaCodedSequence.generate(workArea, 1, 4, 5) - ) - ); - - try (var writer = new DocumentRecordParquetFileWriter(parquetFile)) { - writer.write(doc); - } - - var read = DocumentRecordParquetFileReader.stream(parquetFile).toList(); - assertEquals(List.of(doc), read); - } - - @Test - public void testHugePayload() throws IOException { - List words = IntStream.range(0, 100000).mapToObj(Integer::toString).toList(); - TLongArrayList metas = new TLongArrayList(LongStream.range(0, 100000).toArray()); - - ByteBuffer workArea = ByteBuffer.allocate(1024); - List poses = Stream.generate(() -> (CodedSequence) GammaCodedSequence.generate(workArea, 3, 4)).limit(100000).toList(); - - var doc = new DocumentRecord( - "www.marginalia.nu", - "https://www.marginalia.nu/", - 0, - "OK", - null, - "Itsa me, Marginalia!", - "Hello World", - 3, - "HTML5", - 123, - 0xF00BA3L, - 0.25f, - 5L, - null, - words, - metas, - poses - ); - - try (var writer = new DocumentRecordParquetFileWriter(parquetFile)) { - writer.write(doc); - } - - var read = DocumentRecordParquetFileReader.stream(parquetFile).toList(); - assertEquals(List.of(doc), read); - } - -} \ No newline at end of file diff --git a/code/process-models/processed-data/test/nu/marginalia/io/processed/DomainLinkRecordParquetFileReaderTest.java b/code/process-models/processed-data/test/nu/marginalia/io/processed/DomainLinkRecordParquetFileReaderTest.java deleted file mode 100644 index 274e80d0..00000000 --- a/code/process-models/processed-data/test/nu/marginalia/io/processed/DomainLinkRecordParquetFileReaderTest.java +++ /dev/null @@ -1,49 +0,0 @@ -package nu.marginalia.io.processed; - -import nu.marginalia.model.processed.DomainLinkRecord; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -class DomainLinkRecordParquetFileReaderTest { - Path parquetFile; - - @BeforeEach - public void setUp() throws IOException { - parquetFile = Files.createTempFile(getClass().getSimpleName(), ".parquet"); - } - - @AfterEach - public void tearDown() throws IOException { - Files.deleteIfExists(parquetFile); - } - - @Test - public void testReadFull() throws IOException { - var first = new DomainLinkRecord( - "www.marginalia.nu", - "memex.marginalia.nu"); - var second = new DomainLinkRecord( - "memex.marginalia.nu", - "search.marginalia.nu" - ); - - try (var writer = new DomainLinkRecordParquetFileWriter(parquetFile)) { - writer.write(first); - writer.write(second); - } - - var items = DomainLinkRecordParquetFileReader - .stream(parquetFile) - .toList(); - assertEquals(List.of(first, second), items); - } - -} \ No newline at end of file diff --git a/code/process-models/processed-data/test/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java b/code/process-models/processed-data/test/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java deleted file mode 100644 index b1867100..00000000 --- a/code/process-models/processed-data/test/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java +++ /dev/null @@ -1,69 +0,0 @@ -package nu.marginalia.io.processed; - -import nu.marginalia.model.processed.DomainRecord; -import nu.marginalia.model.processed.DomainWithIp; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import 
org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; - -import static org.junit.jupiter.api.Assertions.*; - -class DomainRecordParquetFileReaderTest { - Path parquetFile; - - @BeforeEach - public void setUp() throws IOException { - parquetFile = Files.createTempFile(getClass().getSimpleName(), ".parquet"); - } - - @AfterEach - public void tearDown() throws IOException { - Files.deleteIfExists(parquetFile); - } - - @Test - public void testReadFull() throws IOException { - var first = new DomainRecord( - "www.marginalia.nu", - 10, - 3, - 5, - "'sall good man", - null, - "127.0.0.1", - List.of("a", "b") - ); - var second = new DomainRecord( - "memex.marginalia.nu", - 0, - 0, - 0, - "REDIRECT", - "www.marginalia.nu", - "127.0.0.1", - null - ); - - try (var writer = new DomainRecordParquetFileWriter(parquetFile)) { - writer.write(first); - writer.write(second); - } - - var domainInfo = DomainRecordParquetFileReader.getBasicDomainInformation(parquetFile); - assertEquals(List.of( - new DomainWithIp("www.marginalia.nu", "127.0.0.1"), - new DomainWithIp("memex.marginalia.nu", "127.0.0.1")), - domainInfo); - - var items = DomainRecordParquetFileReader - .stream(parquetFile) - .toList(); - assertEquals(List.of(first, second), items); - } - -} \ No newline at end of file diff --git a/code/process-models/work-log/build.gradle b/code/process-models/work-log/build.gradle deleted file mode 100644 index 76fe01f9..00000000 --- a/code/process-models/work-log/build.gradle +++ /dev/null @@ -1,24 +0,0 @@ -plugins { - id 'java' - - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) - } -} -apply from: "$rootProject.projectDir/srcsets.gradle" - -dependencies { - implementation libs.bundles.slf4j - - implementation libs.notnull - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito -} - diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index f3e7ae1d..942c8acd 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -28,7 +28,7 @@ dependencies { implementation project(':third-party:parquet-floor') implementation project(':code:index:api') - implementation project(':code:process-mqapi') + implementation project(':code:processes:process-mq-api') implementation project(':code:common:model') implementation project(':code:common:db') @@ -43,9 +43,8 @@ dependencies { implementation project(':code:libraries:geo-ip') implementation project(':code:libraries:language-processing') - implementation project(':code:process-models:processed-data') - implementation project(':code:process-models:work-log') - implementation project(':code:process-models:crawling-model') + implementation project(':code:processes:converting-process:model') + implementation project(':code:processes:crawling-process:model') implementation project(':code:features-convert:adblock') implementation project(':code:features-convert:anchor-keywords') @@ -61,7 +60,7 @@ dependencies { implementation project(':code:features-crawl:content-type') testImplementation project(':code:libraries:term-frequency-dict') - testImplementation project(':code:process-models:crawl-spec') + testImplementation project(':code:processes:crawling-process:model') implementation libs.bundles.slf4j diff --git 
a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java index 83bc63f5..3a978972 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java @@ -6,33 +6,33 @@ import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.ProcessConfiguration; import nu.marginalia.ProcessConfigurationModule; +import nu.marginalia.converting.model.CrawlPlan; +import nu.marginalia.converting.model.WorkDir; +import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.converting.sideload.SideloadSourceFactory; import nu.marginalia.converting.writer.ConverterBatchWritableIf; import nu.marginalia.converting.writer.ConverterBatchWriter; import nu.marginalia.converting.writer.ConverterWriter; -import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.process.log.WorkLog; -import nu.marginalia.process.log.WorkLogEntry; -import nu.marginalia.service.ProcessMainClass; -import nu.marginalia.storage.FileStorageService; +import nu.marginalia.io.crawldata.CrawledDomainReader; +import nu.marginalia.io.crawldata.SerializableCrawlDataStream; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeatImpl; +import nu.marginalia.process.log.WorkLog; +import nu.marginalia.process.log.WorkLogEntry; +import nu.marginalia.service.ProcessMainClass; import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.storage.FileStorageService; import nu.marginalia.util.SimpleBlockingThreadPool; import nu.marginalia.worklog.BatchingWorkLog; import nu.marginalia.worklog.BatchingWorkLogImpl; import org.apache.logging.log4j.util.Strings; -import nu.marginalia.converting.model.CrawlPlan; -import nu.marginalia.converting.processor.DomainProcessor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import nu.marginalia.converting.model.WorkDir; import java.io.IOException; import java.nio.file.Files; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/model/DisqualifiedException.java b/code/processes/converting-process/java/nu/marginalia/converting/model/DisqualifiedException.java index 11c329eb..34c6836d 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/model/DisqualifiedException.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/model/DisqualifiedException.java @@ -1,6 +1,6 @@ package nu.marginalia.converting.model; -import nu.marginalia.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.model.crawldata.CrawlerDocumentStatus; public class DisqualifiedException extends Exception { public final DisqualificationReason reason; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/AcceptableAds.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/AcceptableAds.java index d097c60a..f75c35ad 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/AcceptableAds.java +++ 
b/code/processes/converting-process/java/nu/marginalia/converting/processor/AcceptableAds.java @@ -1,6 +1,6 @@ package nu.marginalia.converting.processor; -import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDocument; import org.jsoup.nodes.Document; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java index 96392920..d4fac8aa 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -2,22 +2,25 @@ package nu.marginalia.converting.processor; import com.google.inject.Inject; import nu.marginalia.atags.model.DomainLinks; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin; import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin; import nu.marginalia.converting.processor.plugin.PlainTextDocumentProcessorPlugin; +import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.crawl.UrlIndexingState; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawlerDocumentStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.URISyntaxException; -import java.util.*; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.Set; public class DocumentProcessor { diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java index 7ec0bf29..966a6939 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -7,19 +7,21 @@ import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; import nu.marginalia.converting.processor.logic.links.LinkGraph; +import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.converting.writer.ConverterBatchWritableIf; import nu.marginalia.converting.writer.ConverterBatchWriter; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.*; import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.geoip.sources.AsnTable; -import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.io.crawldata.SerializableCrawlDataStream; import 
nu.marginalia.model.EdgeDomain; -import nu.marginalia.converting.processor.logic.links.TopKeywords; -import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; +import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDomain; +import nu.marginalia.model.crawldata.CrawlerDomainStatus; import nu.marginalia.util.ProcessingIterator; import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.Nullable; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index df409741..1c959dee 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -1,10 +1,10 @@ package nu.marginalia.converting.processor.logic; import crawlercommons.utils.Strings; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.html.HtmlStandard; import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java index 59b095e7..79f6aebd 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java @@ -1,20 +1,22 @@ package nu.marginalia.converting.processor.plugin; -import nu.marginalia.converting.processor.DocumentClass; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.language.filter.LanguageFilter; -import nu.marginalia.language.model.DocumentLanguageData; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.keyword.model.DocumentKeywordsBuilder; -import nu.marginalia.model.crawl.PubDate; import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.ProcessedDocumentDetails; -import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.converting.processor.DocumentClass; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; +import nu.marginalia.language.filter.LanguageFilter; +import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.html.HtmlStandard; import javax.annotation.Nullable; import java.net.URISyntaxException; -import java.util.*; +import java.util.HashSet; +import java.util.List; +import java.util.Set; public abstract class AbstractDocumentProcessorPlugin { protected LanguageFilter languageFilter; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java 
b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 5514fee9..76b867fb 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -2,33 +2,33 @@ package nu.marginalia.converting.processor.plugin; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.language.filter.LanguageFilter; +import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.GeneratorType; +import nu.marginalia.converting.model.ProcessedDocumentDetails; import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.converting.processor.MetaRobotsTag; +import nu.marginalia.converting.processor.logic.*; import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor; import nu.marginalia.converting.processor.logic.links.FileLinks; import nu.marginalia.converting.processor.logic.links.LinkProcessor; -import nu.marginalia.converting.processor.plugin.specialization.*; +import nu.marginalia.converting.processor.plugin.specialization.HtmlProcessorSpecializations; +import nu.marginalia.gregex.GuardedRegex; +import nu.marginalia.gregex.GuardedRegexFactory; +import nu.marginalia.keyword.DocumentKeywordExtractor; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; +import nu.marginalia.language.filter.LanguageFilter; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider; import nu.marginalia.link_parser.FeedExtractor; -import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.link_parser.LinkParser; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.keyword.DocumentKeywordExtractor; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.idx.DocumentFlags; -import nu.marginalia.keyword.model.DocumentKeywordsBuilder; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.converting.processor.logic.*; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.gregex.GuardedRegex; -import nu.marginalia.gregex.GuardedRegexFactory; -import nu.marginalia.converting.model.DisqualifiedException; -import nu.marginalia.converting.model.ProcessedDocumentDetails; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.html.HtmlStandard; +import nu.marginalia.model.idx.DocumentFlags; +import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.pubdate.PubDateSniffer; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -36,9 +36,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.URISyntaxException; -import java.util.*; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.Set; -import static nu.marginalia.converting.model.DisqualifiedException.*; +import static nu.marginalia.converting.model.DisqualifiedException.DisqualificationReason; public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin { diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java 
b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java index 787cc8a0..c85dfeda 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java @@ -2,22 +2,22 @@ package nu.marginalia.converting.processor.plugin; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.language.filter.LanguageFilter; -import nu.marginalia.converting.processor.DocumentClass; -import nu.marginalia.converting.processor.logic.DocumentLengthLogic; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.keyword.DocumentKeywordExtractor; -import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.idx.DocumentFlags; -import nu.marginalia.keyword.model.DocumentKeywordsBuilder; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.model.crawl.PubDate; import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.ProcessedDocumentDetails; +import nu.marginalia.converting.processor.DocumentClass; +import nu.marginalia.converting.processor.logic.DocumentLengthLogic; import nu.marginalia.converting.processor.logic.PlainTextLogic; -import nu.marginalia.model.EdgeUrl; import nu.marginalia.converting.util.LineUtils; +import nu.marginalia.keyword.DocumentKeywordExtractor; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; +import nu.marginalia.language.filter.LanguageFilter; +import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.html.HtmlStandard; +import nu.marginalia.model.idx.DocumentFlags; +import nu.marginalia.model.idx.DocumentMetadata; import org.apache.commons.lang3.StringUtils; import java.net.URISyntaxException; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 43ae0d81..84b3ab53 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -7,11 +7,11 @@ import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin; -import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.UrlIndexingState; +import nu.marginalia.model.crawldata.CrawledDocument; import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java index 4a20543a..d110d9bd 100644 
--- a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java
@@ -1,27 +1,24 @@
 package nu.marginalia.converting.writer;
 
-import gnu.trove.list.array.TLongArrayList;
 import lombok.SneakyThrows;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.sideload.SideloadSource;
-import nu.marginalia.io.processed.DocumentRecordParquetFileWriter;
-import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter;
-import nu.marginalia.io.processed.DomainRecordParquetFileWriter;
 import nu.marginalia.io.processed.ProcessedDataFileNames;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.model.processed.DocumentRecord;
-import nu.marginalia.model.processed.DomainLinkRecord;
-import nu.marginalia.model.processed.DomainRecord;
+import nu.marginalia.model.processed.SlopDocumentRecord;
+import nu.marginalia.model.processed.SlopDomainLinkRecord;
+import nu.marginalia.model.processed.SlopDomainRecord;
 import nu.marginalia.sequence.CodedSequence;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;
 import java.util.concurrent.Callable;
@@ -30,22 +27,27 @@ import java.util.concurrent.Future;
 
 /** Writer for a single batch of converter parquet files */
 public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriterIf {
-    private final DomainRecordParquetFileWriter domainWriter;
-    private final DomainLinkRecordParquetFileWriter domainLinkWriter;
-    private final DocumentRecordParquetFileWriter documentWriter;
+    private final SlopDomainRecord.Writer domainWriter;
+    private final SlopDomainLinkRecord.Writer domainLinkWriter;
+    private final SlopDocumentRecord.Writer documentWriter;
 
     private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class);
 
     public ConverterBatchWriter(Path basePath, int batchNumber) throws IOException {
-        domainWriter = new DomainRecordParquetFileWriter(
-                ProcessedDataFileNames.domainFileName(basePath, batchNumber)
-        );
-        domainLinkWriter = new DomainLinkRecordParquetFileWriter(
-                ProcessedDataFileNames.domainLinkFileName(basePath, batchNumber)
-        );
-        documentWriter = new DocumentRecordParquetFileWriter(
-                ProcessedDataFileNames.documentFileName(basePath, batchNumber)
-        );
+        if (!Files.exists(ProcessedDataFileNames.domainFileName(basePath))) {
+            Files.createDirectory(ProcessedDataFileNames.domainFileName(basePath));
+        }
+        domainWriter = new SlopDomainRecord.Writer(ProcessedDataFileNames.domainFileName(basePath), batchNumber);
+
+        if (!Files.exists(ProcessedDataFileNames.domainLinkFileName(basePath))) {
+            Files.createDirectory(ProcessedDataFileNames.domainLinkFileName(basePath));
+        }
+        domainLinkWriter = new SlopDomainLinkRecord.Writer(ProcessedDataFileNames.domainLinkFileName(basePath), batchNumber);
+
+        if (!Files.exists(ProcessedDataFileNames.documentFileName(basePath))) {
+            Files.createDirectory(ProcessedDataFileNames.documentFileName(basePath));
+        }
+        documentWriter = new SlopDocumentRecord.Writer(ProcessedDataFileNames.documentFileName(basePath), batchNumber);
     }
 
     @Override
@@ -107,32 +109,31 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriterIf {
         while (documentIterator.hasNext()) {
             var document = documentIterator.next();
             if (document.details == null) {
-                new DocumentRecord(
+                new SlopDocumentRecord(
                         domainName,
                         document.url.toString(),
                         ordinal,
                         document.state.toString(),
-                        document.stateReason,
-                        null,
-                        null,
-                        0,
-                        null,
-                        0,
-                        0L,
-                        -15,
-                        0L,
-                        null,
-                        null,
-                        null,
-                        null);
+                        document.stateReason);
             }
             else {
                 var wb = document.words.build(workArea);
-                List<String> words = Arrays.asList(wb.keywords);
-                TLongArrayList metas = new TLongArrayList(wb.metadata);
-                List<CodedSequence> positions = Arrays.asList(wb.positions);
+                List<String> words = wb.keywords;
+                byte[] metas = wb.metadata;
+                List<CodedSequence> positions = wb.positions;
 
-                documentWriter.write(new DocumentRecord(
+
+                List<CodedSequence> spanSequences = new ArrayList<>(wb.spans.size());
+                byte[] spanCodes = new byte[wb.spans.size()];
+
+                for (int i = 0; i < wb.spans.size(); i++) {
+                    var span = wb.spans.get(i);
+
+                    spanCodes[i] = span.code();
+                    spanSequences.add(span.spans());
+                }
+
+                documentWriter.write(new SlopDocumentRecord(
                         domainName,
                         document.url.toString(),
                         ordinal,
@@ -149,7 +150,9 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriterIf {
                         document.details.pubYear,
                         words,
                         metas,
-                        positions
+                        positions,
+                        spanCodes,
+                        spanSequences
                 ));
             }
@@ -178,7 +181,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriterIf {
                     continue;
                 }
 
-                domainLinkWriter.write(new DomainLinkRecord(
+                domainLinkWriter.write(new SlopDomainLinkRecord(
                         from,
                         dest.toString()
                 ));
@@ -186,7 +189,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriterIf {
         }
 
         if (domain.redirect != null) {
-            domainLinkWriter.write(new DomainLinkRecord(
+            domainLinkWriter.write(new SlopDomainLinkRecord(
                     from,
                     domain.redirect.toString()
             ));
@@ -201,13 +204,13 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriterIf {
         List<String> feeds = getFeedUrls(domain);
 
         domainWriter.write(
-                new DomainRecord(
+                new SlopDomainRecord(
                         domain.domain.toString(),
                         metadata.known(),
                         metadata.good(),
                         metadata.visited(),
-                        Optional.ofNullable(domain.state).map(DomainIndexingState::toString).orElse(null),
-                        Optional.ofNullable(domain.redirect).map(EdgeDomain::toString).orElse(null),
+                        Optional.ofNullable(domain.state).map(DomainIndexingState::toString).orElse(""),
+                        Optional.ofNullable(domain.redirect).map(EdgeDomain::toString).orElse(""),
                         domain.ip,
                         feeds
                 )
diff --git a/code/process-models/processed-data/build.gradle b/code/processes/converting-process/model/build.gradle
similarity index 86%
rename from code/process-models/processed-data/build.gradle
rename to code/processes/converting-process/model/build.gradle
index 21ccf221..a3fc6307 100644
--- a/code/process-models/processed-data/build.gradle
+++ b/code/processes/converting-process/model/build.gradle
@@ -12,9 +12,12 @@ java {
     }
 }
 apply from: "$rootProject.projectDir/srcsets.gradle"
 
+jar.archiveBaseName = 'converting-process-model'
+
 dependencies {
     implementation libs.bundles.slf4j
 
+    implementation project(':code:libraries:slop')
     implementation project(':third-party:parquet-floor')
     implementation project(':code:libraries:coded-sequence')
 
diff --git a/code/processes/converting-process/model/java/nu/marginalia/io/processed/ProcessedDataFileNames.java b/code/processes/converting-process/model/java/nu/marginalia/io/processed/ProcessedDataFileNames.java
new file mode 100644
index 00000000..44b56bc3
--- /dev/null
+++ b/code/processes/converting-process/model/java/nu/marginalia/io/processed/ProcessedDataFileNames.java
@@ -0,0 +1,16 @@
+package nu.marginalia.io.processed;
+
+import java.nio.file.Path;
+
+public class ProcessedDataFileNames {
+    public static Path documentFileName(Path base) {
+        return base.resolve("document");
+    }
+    public static Path domainFileName(Path base) {
+        return base.resolve("domains");
+    }
+    public static Path domainLinkFileName(Path base) {
+        return base.resolve("domain-link");
+    }
+
+}
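For orientation, a minimal usage sketch of the new per-record-type directory layout and batch-page writers introduced above. The base path is hypothetical; the directory names and writer signatures follow the code in this patch, and each converter batch number maps to one slop page, as in the ConverterBatchWriter constructor:

    import nu.marginalia.io.processed.ProcessedDataFileNames;
    import nu.marginalia.model.processed.SlopDomainLinkRecord;

    import java.nio.file.Files;
    import java.nio.file.Path;

    class WriteLinksExample {
        public static void main(String[] args) throws Exception {
            Path basePath = Path.of("/tmp/processed-data"); // hypothetical location
            Path linkDir = ProcessedDataFileNames.domainLinkFileName(basePath);
            Files.createDirectories(linkDir);

            // One slop "page" per converter batch; page 0 here
            try (var writer = new SlopDomainLinkRecord.Writer(linkDir, 0)) {
                writer.write(new SlopDomainLinkRecord("www.marginalia.nu", "search.marginalia.nu"));
            }
        }
    }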
diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java
new file mode 100644
index 00000000..177eaf9a
--- /dev/null
+++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java
@@ -0,0 +1,395 @@
+package nu.marginalia.model.processed;
+
+import lombok.Builder;
+import nu.marginalia.sequence.CodedSequence;
+import nu.marginalia.sequence.GammaCodedSequence;
+import nu.marginalia.slop.column.array.ByteArrayColumnReader;
+import nu.marginalia.slop.column.array.ByteArrayColumnWriter;
+import nu.marginalia.slop.column.dynamic.GammaCodedSequenceReader;
+import nu.marginalia.slop.column.dynamic.GammaCodedSequenceWriter;
+import nu.marginalia.slop.column.dynamic.VarintColumnReader;
+import nu.marginalia.slop.column.dynamic.VarintColumnWriter;
+import nu.marginalia.slop.column.primitive.*;
+import nu.marginalia.slop.column.string.StringColumnReader;
+import nu.marginalia.slop.column.string.StringColumnWriter;
+import nu.marginalia.slop.desc.ColumnDesc;
+import nu.marginalia.slop.desc.ColumnType;
+import nu.marginalia.slop.desc.StorageType;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+public record SlopDocumentRecord(
+        String domain,
+        String url,
+        int ordinal,
+        String state,
+        String stateReason,
+        String title,
+        String description,
+        int htmlFeatures,
+        String htmlStandard,
+        int length,
+        long hash,
+        float quality,
+        long documentMetadata,
+        Integer pubYear,
+        List<String> words,
+        byte[] metas,
+        List<CodedSequence> positions,
+        byte[] spanCodes,
+        List<CodedSequence> spans
+) {
+
+    /** Constructor for partial records */
+    public SlopDocumentRecord(String domain,
+                              String url,
+                              int ordinal,
+                              String state,
+                              String stateReason)
+    {
+        this(domain, url, ordinal, state, stateReason, "", "", 0, "", 0, 0L, 0.0f, 0L, null, List.of(), new byte[0], List.of(), new byte[0], List.of());
+    }
+
+    public SlopDocumentRecord {
+        if (spanCodes.length != spans.size())
+            throw new IllegalArgumentException("Span codes and spans must have the same length");
+        if (metas.length != words.size() || metas.length != positions.size())
+            throw new IllegalArgumentException("Metas, words and positions must have the same length");
+    }
+
+    @Builder
+    public record KeywordsProjection(
+            String domain,
+            int ordinal,
+            int htmlFeatures,
+            long documentMetadata,
+            int length,
+            List<String> words,
+            byte[] metas,
+            List<CodedSequence> positions,
+            byte[] spanCodes,
+            List<CodedSequence> spans)
+    { }
+
+    public record MetadataProjection(
+            String domain,
+            String url,
+            int ordinal,
+            String title,
+            String description,
+            int htmlFeatures,
+            String htmlStandard,
+            int length,
+            long hash,
+            float quality,
+            Integer pubYear
+    ) {
+
+    }
+
+    // Basic information
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> domainsColumn = new ColumnDesc<>("domain", ColumnType.TXTSTRING, StorageType.GZIP);
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> urlsColumn = new ColumnDesc<>("url", ColumnType.TXTSTRING, StorageType.GZIP);
+    private static final ColumnDesc<VarintColumnReader, VarintColumnWriter> ordinalsColumn = new ColumnDesc<>("ordinal", ColumnType.VARINT_LE, StorageType.PLAIN);
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> statesColumn = new ColumnDesc<>("state", ColumnType.ENUM_LE, StorageType.PLAIN);
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> stateReasonsColumn = new ColumnDesc<>("stateReason", ColumnType.TXTSTRING, StorageType.GZIP);
+
+    // Document metadata
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> titlesColumn = new ColumnDesc<>("title", ColumnType.STRING, StorageType.GZIP);
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> descriptionsColumn = new ColumnDesc<>("description", ColumnType.STRING, StorageType.GZIP);
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> htmlStandardsColumn = new ColumnDesc<>("htmlStandard", ColumnType.ENUM_LE, StorageType.GZIP);
+    private static final ColumnDesc<IntColumnReader, IntColumnWriter> htmlFeaturesColumn = new ColumnDesc<>("htmlFeatures", ColumnType.INT_LE, StorageType.PLAIN);
+    private static final ColumnDesc<IntColumnReader, IntColumnWriter> lengthsColumn = new ColumnDesc<>("length", ColumnType.INT_LE, StorageType.PLAIN);
+    private static final ColumnDesc<IntColumnReader, IntColumnWriter> pubYearColumn = new ColumnDesc<>("pubYear", ColumnType.INT_LE, StorageType.PLAIN);
+    private static final ColumnDesc<LongColumnReader, LongColumnWriter> hashesColumn = new ColumnDesc<>("hash", ColumnType.LONG_LE, StorageType.PLAIN);
+    private static final ColumnDesc<FloatColumnReader, FloatColumnWriter> qualitiesColumn = new ColumnDesc<>("quality", ColumnType.FLOAT_LE, StorageType.PLAIN);
+    private static final ColumnDesc<LongColumnReader, LongColumnWriter> domainMetadata = new ColumnDesc<>("domainMetadata", ColumnType.LONG_LE, StorageType.PLAIN);
+
+    // Keyword-level columns, these are enumerated by the counts column
+    private static final ColumnDesc<VarintColumnReader, VarintColumnWriter> termCountsColumn = new ColumnDesc<>("termCounts", ColumnType.VARINT_LE, StorageType.PLAIN);
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> keywordsColumn = new ColumnDesc<>("keywords", ColumnType.STRING, StorageType.ZSTD);
+    private static final ColumnDesc<ByteColumnReader, ByteColumnWriter> termMetaColumn = new ColumnDesc<>("termMetadata", ColumnType.BYTE, StorageType.ZSTD);
+    private static final ColumnDesc<GammaCodedSequenceReader, GammaCodedSequenceWriter> termPositionsColumn = new ColumnDesc<>("termPositions", ColumnType.BYTE_ARRAY_GCS, StorageType.ZSTD);
+
+    // Spans columns
+    private static final ColumnDesc<ByteArrayColumnReader, ByteArrayColumnWriter> spanCodesColumn = new ColumnDesc<>("spanCodes", ColumnType.BYTE_ARRAY, StorageType.ZSTD);
+    private static final ColumnDesc<GammaCodedSequenceReader, GammaCodedSequenceWriter> spansColumn = new ColumnDesc<>("spans", ColumnType.BYTE_ARRAY_GCS, StorageType.ZSTD);
+
+    public static class KeywordsProjectionReader implements AutoCloseable {
+        private final StringColumnReader domainsReader;
+        private final VarintColumnReader ordinalsReader;
+        private final IntColumnReader htmlFeaturesReader;
+        private final LongColumnReader domainMetadataReader;
+        private final IntColumnReader lengthsReader;
+        private final StringColumnReader keywordsReader;
+        private final VarintColumnReader termCountsReader;
+        private final ByteColumnReader termMetaReader;
+        private final GammaCodedSequenceReader termPositionsReader;
+
+        private final ByteArrayColumnReader spanCodesReader;
+        private final GammaCodedSequenceReader spansReader;
+
+        private final ByteBuffer workBuffer = ByteBuffer.allocate(65536);
+
+        public KeywordsProjectionReader(SlopPageRef pageRef) throws IOException {
+            this(pageRef.baseDir(), pageRef.page());
+        }
+
+        public KeywordsProjectionReader(Path baseDir, int page) throws IOException {
+            domainsReader = domainsColumn.forPage(page).open(baseDir);
+            ordinalsReader = ordinalsColumn.forPage(page).open(baseDir);
+            htmlFeaturesReader = htmlFeaturesColumn.forPage(page).open(baseDir);
+            domainMetadataReader = domainMetadata.forPage(page).open(baseDir);
+            lengthsReader = lengthsColumn.forPage(page).open(baseDir);
+            keywordsReader = keywordsColumn.forPage(page).open(baseDir);
+            termCountsReader = termCountsColumn.forPage(page).open(baseDir);
+            termMetaReader = termMetaColumn.forPage(page).open(baseDir);
+            termPositionsReader = termPositionsColumn.forPage(page).open(baseDir);
+            spanCodesReader = spanCodesColumn.forPage(page).open(baseDir);
+            spansReader = spansColumn.forPage(page).open(baseDir);
+        }
+
+        public boolean hasMore() throws IOException {
+            return domainsReader.hasRemaining();
+        }
+
+        public KeywordsProjection next() throws IOException {
+            String domain = domainsReader.get();
+            int ordinal = (int) ordinalsReader.get();
+            int htmlFeatures = htmlFeaturesReader.get();
+            long documentMetadata = domainMetadataReader.get();
+            int length = lengthsReader.get();
+            List<String> words = new ArrayList<>();
+
+            List<CodedSequence> positions = new ArrayList<>();
+
+            int termCounts = (int) termCountsReader.get();
+            byte[] metas = new byte[termCounts];
+
+            for (int i = 0; i < termCounts; i++) {
+                metas[i] = termMetaReader.get();
+                words.add(keywordsReader.get());
+                positions.add(termPositionsReader.get(workBuffer));
+            }
+
+            byte[] spanCodes = spanCodesReader.get();
+
+            List<CodedSequence> spans = new ArrayList<>(spanCodes.length);
+            for (int i = 0; i < spanCodes.length; i++) {
+                spans.add(spansReader.get(workBuffer));
+            }
+
+            return new KeywordsProjection(
+                    domain,
+                    ordinal,
+                    htmlFeatures,
+                    documentMetadata,
+                    length,
+                    words,
+                    metas,
+                    positions,
+                    spanCodes,
+                    spans
+            );
+        }
+
+
+        public void close() throws IOException {
+            domainsReader.close();
+            ordinalsReader.close();
+            htmlFeaturesReader.close();
+            domainMetadataReader.close();
+            lengthsReader.close();
+            keywordsReader.close();
+            termMetaReader.close();
+            termPositionsReader.close();
+            spanCodesReader.close();
+            spansReader.close();
+        }
+    }
+
+    public static class MetadataReader implements AutoCloseable {
+        private final StringColumnReader domainsReader;
+        private final StringColumnReader urlsReader;
+        private final VarintColumnReader ordinalsReader;
+        private final StringColumnReader titlesReader;
+        private final StringColumnReader descriptionsReader;
+        private final IntColumnReader htmlFeaturesReader;
+        private final StringColumnReader htmlStandardsReader;
+        private final IntColumnReader lengthsReader;
+        private final LongColumnReader hashesReader;
+        private final FloatColumnReader qualitiesReader;
+        private final IntColumnReader pubYearReader;
+
+        public MetadataReader(SlopPageRef pageRef) throws IOException {
+            this(pageRef.baseDir(), pageRef.page());
+        }
+
+        public MetadataReader(Path baseDir, int page) throws IOException {
+            this.domainsReader = domainsColumn.forPage(page).open(baseDir);
+            this.urlsReader = urlsColumn.forPage(page).open(baseDir);
+            this.ordinalsReader = ordinalsColumn.forPage(page).open(baseDir);
+            this.titlesReader = titlesColumn.forPage(page).open(baseDir);
+            this.descriptionsReader = descriptionsColumn.forPage(page).open(baseDir);
+            this.htmlFeaturesReader = htmlFeaturesColumn.forPage(page).open(baseDir);
+            this.htmlStandardsReader = htmlStandardsColumn.forPage(page).open(baseDir);
+            this.lengthsReader = lengthsColumn.forPage(page).open(baseDir);
+            this.hashesReader = hashesColumn.forPage(page).open(baseDir);
+            this.qualitiesReader = qualitiesColumn.forPage(page).open(baseDir);
+            this.pubYearReader = pubYearColumn.forPage(page).open(baseDir);
+        }
+
+        public MetadataProjection next() throws IOException {
+            int pubYear = pubYearReader.get();
+            return new MetadataProjection(
+                    domainsReader.get(),
+                    urlsReader.get(),
+                    (int) ordinalsReader.get(),
+                    titlesReader.get(),
+                    descriptionsReader.get(),
+                    htmlFeaturesReader.get(),
+                    htmlStandardsReader.get(),
+                    lengthsReader.get(),
+                    hashesReader.get(),
+                    qualitiesReader.get(),
+                    pubYear < 0 ? null : pubYear
+            );
+        }
+
+        public boolean hasNext() throws IOException {
+            return domainsReader.hasRemaining();
+        }
+
+        public void close() throws IOException {
+            domainsReader.close();
+            urlsReader.close();
+            ordinalsReader.close();
+            titlesReader.close();
+            descriptionsReader.close();
+            htmlFeaturesReader.close();
+            htmlStandardsReader.close();
+            lengthsReader.close();
+            hashesReader.close();
+            qualitiesReader.close();
+            pubYearReader.close();
+        }
+    }
+
+    public static class Writer implements AutoCloseable {
+        private final StringColumnWriter domainsWriter;
+        private final StringColumnWriter urlsWriter;
+        private final VarintColumnWriter ordinalsWriter;
+        private final StringColumnWriter statesWriter;
+        private final StringColumnWriter stateReasonsWriter;
+        private final StringColumnWriter titlesWriter;
+        private final StringColumnWriter descriptionsWriter;
+        private final IntColumnWriter htmlFeaturesWriter;
+        private final StringColumnWriter htmlStandardsWriter;
+        private final IntColumnWriter lengthsWriter;
+        private final LongColumnWriter hashesWriter;
+        private final FloatColumnWriter qualitiesWriter;
+        private final LongColumnWriter domainMetadataWriter;
+        private final IntColumnWriter pubYearWriter;
+        private final VarintColumnWriter termCountsWriter;
+        private final StringColumnWriter keywordsWriter;
+        private final ByteColumnWriter termMetaWriter;
+        private final GammaCodedSequenceWriter termPositionsWriter;
+        private final ByteArrayColumnWriter spansCodesWriter;
+        private final GammaCodedSequenceWriter spansWriter;
+
+        public Writer(Path baseDir, int page) throws IOException {
+            domainsWriter = domainsColumn.forPage(page).create(baseDir);
+            urlsWriter = urlsColumn.forPage(page).create(baseDir);
+            ordinalsWriter = ordinalsColumn.forPage(page).create(baseDir);
+            statesWriter = statesColumn.forPage(page).create(baseDir);
+            stateReasonsWriter = stateReasonsColumn.forPage(page).create(baseDir);
+            titlesWriter = titlesColumn.forPage(page).create(baseDir);
+            descriptionsWriter = descriptionsColumn.forPage(page).create(baseDir);
+            htmlFeaturesWriter = htmlFeaturesColumn.forPage(page).create(baseDir);
+            htmlStandardsWriter = htmlStandardsColumn.forPage(page).create(baseDir);
+            lengthsWriter = lengthsColumn.forPage(page).create(baseDir);
+            hashesWriter = hashesColumn.forPage(page).create(baseDir);
+            qualitiesWriter = qualitiesColumn.forPage(page).create(baseDir);
+            domainMetadataWriter = domainMetadata.forPage(page).create(baseDir);
+            pubYearWriter = pubYearColumn.forPage(page).create(baseDir);
+            termCountsWriter = termCountsColumn.forPage(page).create(baseDir);
+            keywordsWriter = keywordsColumn.forPage(page).create(baseDir);
+            termMetaWriter = termMetaColumn.forPage(page).create(baseDir);
+            termPositionsWriter = termPositionsColumn.forPage(page).create(baseDir);
+
+            spansWriter = spansColumn.forPage(page).create(baseDir);
+            spansCodesWriter = spanCodesColumn.forPage(page).create(baseDir);
+        }
+
+        public void write(SlopDocumentRecord record) throws IOException {
+            domainsWriter.put(record.domain());
+            urlsWriter.put(record.url());
+            ordinalsWriter.put(record.ordinal());
+            statesWriter.put(record.state());
+            stateReasonsWriter.put(record.stateReason());
+            titlesWriter.put(record.title());
+            descriptionsWriter.put(record.description());
+            htmlFeaturesWriter.put(record.htmlFeatures());
+            htmlStandardsWriter.put(record.htmlStandard());
+            lengthsWriter.put(record.length());
+            hashesWriter.put(record.hash());
+            qualitiesWriter.put(record.quality());
+            domainMetadataWriter.put(record.documentMetadata());
+
+            if (record.pubYear == null) {
+                pubYearWriter.put(-1);
+            } else {
+                pubYearWriter.put(record.pubYear());
+            }
+
+            byte[] termMetadata = record.metas();
+            List<String> keywords = record.words();
+            List<CodedSequence> termPositions = record.positions();
+
+            termCountsWriter.put(termMetadata.length);
+
+            for (int i = 0; i < termMetadata.length; i++) {
+                termMetaWriter.put(termMetadata[i]);
+                keywordsWriter.put(keywords.get(i));
+
+                termPositionsWriter.put((GammaCodedSequence) termPositions.get(i));
+            }
+
+            assert record.spanCodes().length == record.spans.size() : "Span codes and spans must have the same length";
+
+            spansCodesWriter.put(record.spanCodes());
+            for (var span : record.spans) {
+                spansWriter.put((GammaCodedSequence) span);
+            }
+
+        }
+
+        public void close() throws IOException {
+            domainsWriter.close();
+            urlsWriter.close();
+            ordinalsWriter.close();
+            statesWriter.close();
+            stateReasonsWriter.close();
+            titlesWriter.close();
+            descriptionsWriter.close();
+            htmlFeaturesWriter.close();
+            htmlStandardsWriter.close();
+            lengthsWriter.close();
+            hashesWriter.close();
+            qualitiesWriter.close();
+            domainMetadataWriter.close();
+            pubYearWriter.close();
+            termCountsWriter.close();
+            keywordsWriter.close();
+            termMetaWriter.close();
+            termPositionsWriter.close();
+
+            spansCodesWriter.close();
+            spansWriter.close();
+        }
+    }
+}
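For orientation, a minimal read-side sketch of the keyword projection above. The directory and page number are hypothetical; the loop mirrors the hasMore()/next() protocol of KeywordsProjectionReader, where the per-keyword columns are parallel arrays enumerated by the termCounts column:

    import nu.marginalia.model.processed.SlopDocumentRecord;

    import java.nio.file.Path;

    class ReadKeywordsExample {
        public static void main(String[] args) throws Exception {
            Path docDir = Path.of("/tmp/processed-data/document"); // hypothetical location
            // Each batch written by ConverterBatchWriter becomes one slop page
            try (var reader = new SlopDocumentRecord.KeywordsProjectionReader(docDir, 0)) {
                while (reader.hasMore()) {
                    SlopDocumentRecord.KeywordsProjection proj = reader.next();
                    // words, metas and positions are parallel: entry i of each
                    // describes the same keyword
                    System.out.println(proj.domain() + ": " + proj.words().size() + " keywords");
                }
            }
        }
    }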
diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java
new file mode 100644
index 00000000..d0b3c6d6
--- /dev/null
+++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java
@@ -0,0 +1,83 @@
+package nu.marginalia.model.processed;
+
+import nu.marginalia.slop.column.string.StringColumnReader;
+import nu.marginalia.slop.column.string.StringColumnWriter;
+import nu.marginalia.slop.desc.ColumnDesc;
+import nu.marginalia.slop.desc.ColumnType;
+import nu.marginalia.slop.desc.StorageType;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.function.Consumer;
+
+public record SlopDomainLinkRecord(
+        String source,
+        String dest)
+{
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> sourcesColumn = new ColumnDesc<>("source", ColumnType.TXTSTRING, StorageType.GZIP);
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> destsColumn = new ColumnDesc<>("dest", ColumnType.TXTSTRING, StorageType.GZIP);
+
+    public static Reader reader(Path baseDir, int page) throws IOException {
+        return new Reader(baseDir, page);
+    }
+
+    public static class Reader implements AutoCloseable {
+        private final StringColumnReader sourcesReader;
+        private final StringColumnReader destsReader;
+
+        public Reader(SlopPageRef page) throws IOException {
+            this(page.baseDir(), page.page());
+        }
+
+        public Reader(Path baseDir, int page) throws IOException {
+            sourcesReader = sourcesColumn.forPage(page).open(baseDir);
+            destsReader = destsColumn.forPage(page).open(baseDir);
+        }
+
+
+        @Override
+        public void close() throws IOException {
+            sourcesReader.close();
+            destsReader.close();
+        }
+
+        public boolean hasMore() throws IOException {
+            return sourcesReader.hasRemaining();
+        }
+
+        public void forEach(Consumer<SlopDomainLinkRecord> recordConsumer) throws IOException {
+            while (hasMore()) {
+                recordConsumer.accept(next());
+            }
+        }
+
+        public SlopDomainLinkRecord next() throws IOException {
+
+            return new SlopDomainLinkRecord(
+                    sourcesReader.get(),
+                    destsReader.get()
+            );
+        }
+    }
+
+    public static class Writer implements AutoCloseable {
+        private final StringColumnWriter sourcesWriter;
+        private final StringColumnWriter destsWriter;
+
+        public Writer(Path baseDir, int page) throws IOException {
+            sourcesWriter = sourcesColumn.forPage(page).create(baseDir);
+            destsWriter = destsColumn.forPage(page).create(baseDir);
+        }
+
+        public void write(SlopDomainLinkRecord record) throws IOException {
+            sourcesWriter.put(record.source());
+            destsWriter.put(record.dest());
+        }
+
+        @Override
+        public void close() throws IOException {
+            sourcesWriter.close();
+            destsWriter.close();
+        }
+    }
+}
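A short consumption sketch for the link records above, using the Reader's forEach helper. The directory and page number are hypothetical; the reader factory and record accessors come from the code in this patch:

    import nu.marginalia.model.processed.SlopDomainLinkRecord;

    import java.nio.file.Path;

    class ReadLinksExample {
        public static void main(String[] args) throws Exception {
            Path linkDir = Path.of("/tmp/processed-data/domain-link"); // hypothetical location
            try (var reader = SlopDomainLinkRecord.reader(linkDir, 0)) {
                // Consumer-style iteration over every source -> dest edge in the page
                reader.forEach(link -> System.out.println(link.source() + " -> " + link.dest()));
            }
        }
    }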
diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java
new file mode 100644
index 00000000..059a6e81
--- /dev/null
+++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java
@@ -0,0 +1,240 @@
+package nu.marginalia.model.processed;
+
+import nu.marginalia.slop.column.primitive.IntColumnReader;
+import nu.marginalia.slop.column.primitive.IntColumnWriter;
+import nu.marginalia.slop.column.string.StringColumnReader;
+import nu.marginalia.slop.column.string.StringColumnWriter;
+import nu.marginalia.slop.desc.ColumnDesc;
+import nu.marginalia.slop.desc.ColumnType;
+import nu.marginalia.slop.desc.StorageType;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Consumer;
+
+public record SlopDomainRecord(
+        String domain,
+        int knownUrls,
+        int goodUrls,
+        int visitedUrls,
+        String state,
+        String redirectDomain,
+        String ip,
+        List<String> rssFeeds)
+{
+
+    public record DomainWithIpProjection(
+            String domain,
+            String ip)
+    {}
+
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> domainsColumn = new ColumnDesc<>("domain", ColumnType.TXTSTRING, StorageType.GZIP);
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> statesColumn = new ColumnDesc<>("state", ColumnType.ENUM_LE, StorageType.PLAIN);
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> redirectDomainsColumn = new ColumnDesc<>("redirectDomain", ColumnType.TXTSTRING, StorageType.GZIP);
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> ipColumn = new ColumnDesc<>("ip", ColumnType.TXTSTRING, StorageType.GZIP);
+
+    private static final ColumnDesc<IntColumnReader, IntColumnWriter> knownUrlsColumn = new ColumnDesc<>("knownUrls", ColumnType.INT_LE, StorageType.PLAIN);
+    private static final ColumnDesc<IntColumnReader, IntColumnWriter> goodUrlsColumn = new ColumnDesc<>("goodUrls", ColumnType.INT_LE, StorageType.PLAIN);
+    private static final ColumnDesc<IntColumnReader, IntColumnWriter> visitedUrlsColumn = new ColumnDesc<>("visitedUrls", ColumnType.INT_LE, StorageType.PLAIN);
+
+    private static final ColumnDesc<IntColumnReader, IntColumnWriter> rssFeedsCountColumn = new ColumnDesc<>("rssFeeds", ColumnType.INT_LE, StorageType.GZIP);
+    private static final ColumnDesc<StringColumnReader, StringColumnWriter> rssFeedsColumn = new ColumnDesc<>("rssFeeds", ColumnType.TXTSTRING, StorageType.GZIP);
+
+
+    public static class DomainNameReader implements AutoCloseable {
+        private final StringColumnReader domainsReader;
+
+        public DomainNameReader(SlopPageRef page) throws IOException {
+            this(page.baseDir(), page.page());
+        }
+
+        public DomainNameReader(Path baseDir, int page) throws IOException {
+            domainsReader = domainsColumn.forPage(page).open(baseDir);
+        }
+
+
+        @Override
+        public void close() throws IOException {
+            domainsReader.close();
+        }
+
+        public boolean hasMore() throws IOException {
+            return domainsReader.hasRemaining();
+        }
+
+        public String next() throws IOException {
+            return domainsReader.get();
+        }
+    }
+
+    public static class DomainWithIpReader implements AutoCloseable {
+        private final StringColumnReader domainsReader;
+        private final StringColumnReader ipReader;
+
+        public DomainWithIpReader(SlopPageRef page) throws IOException {
+            this(page.baseDir(), page.page());
+        }
+
+        public DomainWithIpReader(Path baseDir, int page) throws IOException {
+            domainsReader = domainsColumn.forPage(page).open(baseDir);
+            ipReader = ipColumn.forPage(page).open(baseDir);
+        }
+
+
+        @Override
+        public void close() throws IOException {
+            domainsReader.close();
+            ipReader.close();
+        }
+
+        public boolean hasMore() throws IOException {
+            return domainsReader.hasRemaining();
+        }
+
+        public DomainWithIpProjection next() throws IOException {
+
+            return new DomainWithIpProjection(
+                    domainsReader.get(),
+                    ipReader.get()
+            );
+        }
+    }
+
+    public static class Reader implements AutoCloseable {
+        private final StringColumnReader domainsReader;
+        private final StringColumnReader statesReader;
+        private final StringColumnReader redirectReader;
+        private final StringColumnReader ipReader;
+
+        private final IntColumnReader knownUrlsReader;
+        private final IntColumnReader goodUrlsReader;
+        private final IntColumnReader visitedUrlsReader;
+
+        private final IntColumnReader rssFeedsCountReader;
+        private final StringColumnReader rssFeedsReader;
+
+        public Reader(SlopPageRef page) throws IOException {
+            this(page.baseDir(), page.page());
+        }
+
+        public Reader(Path baseDir, int page) throws IOException {
+            domainsReader = domainsColumn.forPage(page).open(baseDir);
+            statesReader = statesColumn.forPage(page).open(baseDir);
+            redirectReader = redirectDomainsColumn.forPage(page).open(baseDir);
+            ipReader = ipColumn.forPage(page).open(baseDir);
+
+            knownUrlsReader = knownUrlsColumn.forPage(page).open(baseDir);
+            goodUrlsReader = goodUrlsColumn.forPage(page).open(baseDir);
+            visitedUrlsReader = visitedUrlsColumn.forPage(page).open(baseDir);
+
+            rssFeedsCountReader = rssFeedsCountColumn.forPage(page).open(baseDir);
+            rssFeedsReader = rssFeedsColumn.forPage(page).open(baseDir);
+        }
+
+
+        @Override
+        public void close() throws IOException {
+            domainsReader.close();
+            statesReader.close();
+            redirectReader.close();
+            ipReader.close();
+
+            knownUrlsReader.close();
+            goodUrlsReader.close();
+            visitedUrlsReader.close();
+
+            rssFeedsCountReader.close();
+            rssFeedsReader.close();
+        }
+
+        public boolean hasMore() throws IOException {
+            return domainsReader.hasRemaining();
+        }
+
+        public void forEach(Consumer<SlopDomainRecord> recordConsumer) throws IOException {
+            while (hasMore()) {
+                recordConsumer.accept(next());
+            }
+        }
+
+        public SlopDomainRecord next() throws IOException {
+            List<String> rssFeeds = new ArrayList<>();
+            int rssFeedsCount = rssFeedsCountReader.get();
+            for (int i = 0; i < rssFeedsCount; i++) {
+                rssFeeds.add(rssFeedsReader.get());
+            }
+
+            return new SlopDomainRecord(
+                    domainsReader.get(),
+                    knownUrlsReader.get(),
+                    goodUrlsReader.get(),
+                    visitedUrlsReader.get(),
+                    statesReader.get(),
+                    redirectReader.get(),
+                    ipReader.get(),
+                    rssFeeds
+            );
+        }
+    }
+
+    public static class Writer implements AutoCloseable {
+        private final StringColumnWriter domainsWriter;
+        private final StringColumnWriter statesWriter;
+        private final StringColumnWriter redirectWriter;
+        private final StringColumnWriter ipWriter;
+
+        private final IntColumnWriter knownUrlsWriter;
+        private final IntColumnWriter goodUrlsWriter;
+        private final IntColumnWriter visitedUrlsWriter;
+
+        private final IntColumnWriter rssFeedsCountWriter;
+        private final StringColumnWriter rssFeedsWriter;
+
+        public Writer(Path baseDir, int page) throws IOException {
+            domainsWriter = domainsColumn.forPage(page).create(baseDir);
+            statesWriter = statesColumn.forPage(page).create(baseDir);
+            redirectWriter = redirectDomainsColumn.forPage(page).create(baseDir);
+            ipWriter = ipColumn.forPage(page).create(baseDir);
+
+            knownUrlsWriter = knownUrlsColumn.forPage(page).create(baseDir);
+            goodUrlsWriter = goodUrlsColumn.forPage(page).create(baseDir);
+            visitedUrlsWriter = visitedUrlsColumn.forPage(page).create(baseDir);
+
+            rssFeedsCountWriter = rssFeedsCountColumn.forPage(page).create(baseDir);
+            rssFeedsWriter = rssFeedsColumn.forPage(page).create(baseDir);
+        }
+
+        public void write(SlopDomainRecord record) throws IOException {
+            domainsWriter.put(record.domain());
+            statesWriter.put(record.state());
+            redirectWriter.put(record.redirectDomain());
+            ipWriter.put(record.ip());
+
+            knownUrlsWriter.put(record.knownUrls());
+            goodUrlsWriter.put(record.goodUrls());
+            visitedUrlsWriter.put(record.visitedUrls());
+
+            rssFeedsCountWriter.put(record.rssFeeds().size());
+            for (String rssFeed : record.rssFeeds()) {
+                rssFeedsWriter.put(rssFeed);
+            }
+        }
+
+        @Override
+        public void close() throws IOException {
+            domainsWriter.close();
+            statesWriter.close();
+            redirectWriter.close();
+            ipWriter.close();
+
+            knownUrlsWriter.close();
+            goodUrlsWriter.close();
+            visitedUrlsWriter.close();
+
+            rssFeedsCountWriter.close();
+            rssFeedsWriter.close();
+        }
+    }
+}
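A minimal round-trip sketch for the domain records above, illustrating how the variable-length rssFeeds list is flattened into a count column plus a value column. The temp directory and field values are illustrative only; the writer and reader signatures follow the code in this patch:

    import nu.marginalia.model.processed.SlopDomainRecord;

    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.util.List;

    class DomainRecordRoundTrip {
        public static void main(String[] args) throws Exception {
            Path dir = Files.createTempDirectory("domains"); // hypothetical location
            var record = new SlopDomainRecord("www.marginalia.nu", 10, 3, 5,
                    "ACTIVE", "", "127.0.0.1", List.of("feed.xml")); // illustrative values

            // Writer.write() emits rssFeeds().size() into the count column first,
            // then each feed URL into the flattened value column
            try (var writer = new SlopDomainRecord.Writer(dir, 0)) {
                writer.write(record);
            }
            // Reader.next() consumes the count, then that many values
            try (var reader = new SlopDomainRecord.Reader(dir, 0)) {
                reader.forEach(r -> System.out.println(r.domain() + " " + r.rssFeeds()));
            }
        }
    }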
diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopPageRef.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopPageRef.java
new file mode 100644
index 00000000..fb349621
--- /dev/null
+++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopPageRef.java
@@ -0,0 +1,6 @@
+package nu.marginalia.model.processed;
+
+import java.nio.file.Path;
+
+public record SlopPageRef(Path baseDir, int page) {
+}
diff --git a/code/process-models/work-log/java/nu/marginalia/worklog/BatchingWorkLog.java b/code/processes/converting-process/model/java/nu/marginalia/worklog/BatchingWorkLog.java
similarity index 100%
rename from code/process-models/work-log/java/nu/marginalia/worklog/BatchingWorkLog.java
rename to code/processes/converting-process/model/java/nu/marginalia/worklog/BatchingWorkLog.java
diff --git a/code/process-models/work-log/java/nu/marginalia/worklog/BatchingWorkLogImpl.java b/code/processes/converting-process/model/java/nu/marginalia/worklog/BatchingWorkLogImpl.java
similarity index 100%
rename from code/process-models/work-log/java/nu/marginalia/worklog/BatchingWorkLogImpl.java
rename to code/processes/converting-process/model/java/nu/marginalia/worklog/BatchingWorkLogImpl.java
diff --git a/code/process-models/work-log/java/nu/marginalia/worklog/BatchingWorkLogInspector.java b/code/processes/converting-process/model/java/nu/marginalia/worklog/BatchingWorkLogInspector.java
similarity index 100%
rename from code/process-models/work-log/java/nu/marginalia/worklog/BatchingWorkLogInspector.java
rename to code/processes/converting-process/model/java/nu/marginalia/worklog/BatchingWorkLogInspector.java
diff --git a/code/process-models/processed-data/readme.md b/code/processes/converting-process/model/readme.md
similarity index 100%
rename from code/process-models/processed-data/readme.md
rename to
code/processes/converting-process/model/readme.md diff --git a/code/process-models/work-log/test/nu/marginalia/worklog/BatchingWorkLogImplTest.java b/code/processes/converting-process/model/test/nu/marginalia/worklog/BatchingWorkLogImplTest.java similarity index 100% rename from code/process-models/work-log/test/nu/marginalia/worklog/BatchingWorkLogImplTest.java rename to code/processes/converting-process/model/test/nu/marginalia/worklog/BatchingWorkLogImplTest.java diff --git a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java index 61de3c38..06b839eb 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -3,21 +3,21 @@ package nu.marginalia.converting; import com.google.inject.Guice; import com.google.inject.Injector; -import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.DomainProcessor; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.io.crawldata.SerializableCrawlDataStream; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.UrlIndexingState; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDomain; +import nu.marginalia.model.crawldata.SerializableCrawlData; +import nu.marginalia.model.html.HtmlStandard; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import java.io.*; +import java.io.IOException; import java.nio.file.Path; import java.time.LocalTime; import java.util.*; diff --git a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 85651501..0e935276 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -12,14 +12,14 @@ import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; -import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; -import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; +import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDomain; +import nu.marginalia.model.crawldata.SerializableCrawlData; import nu.marginalia.model.crawlspec.CrawlSpecRecord; +import 
nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter; import org.junit.jupiter.api.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index 5105543d..4fdea7d8 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -29,12 +29,12 @@ dependencies { implementation project(':code:common:service') implementation project(':code:libraries:blocking-thread-pool') implementation project(':code:index:api') - implementation project(':code:process-mqapi') + implementation project(':code:processes:process-mq-api') implementation project(':code:libraries:message-queue') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:easy-lsh') - implementation project(':code:process-models:crawling-model') - implementation project(':code:process-models:crawl-spec') + implementation project(':code:processes:crawling-process:model') + implementation project(':code:processes:crawling-process:model') implementation project(':code:features-convert:anchor-keywords') diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java index 5173af75..cd83edc5 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java @@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome; import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.crawl.retreival.CrawlDataReference; +import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; @@ -19,22 +20,21 @@ import nu.marginalia.crawl.spec.DbCrawlSpecProvider; import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider; import nu.marginalia.crawl.warc.WarcArchiverFactory; import nu.marginalia.crawl.warc.WarcArchiverIf; -import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.crawling.io.CrawlerOutputFile; -import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; import nu.marginalia.crawlspec.CrawlSpecFileNames; +import nu.marginalia.io.crawldata.CrawledDomainReader; +import nu.marginalia.io.crawldata.CrawlerOutputFile; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.service.ProcessMainClass; -import nu.marginalia.storage.FileStorageService; import nu.marginalia.model.crawlspec.CrawlSpecRecord; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqSingleShotInbox; +import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter; import nu.marginalia.process.control.ProcessHeartbeatImpl; import nu.marginalia.process.log.WorkLog; +import nu.marginalia.service.ProcessMainClass; import nu.marginalia.service.module.DatabaseModule; -import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.storage.FileStorageService; import nu.marginalia.util.SimpleBlockingThreadPool; import okhttp3.ConnectionPool; import okhttp3.Dispatcher; @@ -47,8 +47,12 @@ import java.nio.file.Path; import java.nio.file.StandardCopyOption; import java.security.Security; 
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
index 5173af75..cd83edc5 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.source.AnchorTagsSource;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
+import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
@@ -19,22 +20,21 @@ import nu.marginalia.crawl.spec.DbCrawlSpecProvider;
 import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider;
 import nu.marginalia.crawl.warc.WarcArchiverFactory;
 import nu.marginalia.crawl.warc.WarcArchiverIf;
-import nu.marginalia.crawling.io.CrawledDomainReader;
-import nu.marginalia.crawling.io.CrawlerOutputFile;
-import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.crawlspec.CrawlSpecFileNames;
+import nu.marginalia.io.crawldata.CrawledDomainReader;
+import nu.marginalia.io.crawldata.CrawlerOutputFile;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.service.ProcessMainClass;
-import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.mq.MqMessage;
 import nu.marginalia.mq.inbox.MqInboxResponse;
 import nu.marginalia.mq.inbox.MqSingleShotInbox;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.process.control.ProcessHeartbeatImpl;
 import nu.marginalia.process.log.WorkLog;
+import nu.marginalia.service.ProcessMainClass;
 import nu.marginalia.service.module.DatabaseModule;
-import nu.marginalia.crawl.retreival.CrawlerRetreiver;
+import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.util.SimpleBlockingThreadPool;
 import okhttp3.ConnectionPool;
 import okhttp3.Dispatcher;
@@ -47,8 +47,12 @@ import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.security.Security;
 import java.sql.SQLException;
-import java.util.*;
-import java.util.concurrent.*;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
index 65e1529b..8b34cb77 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
@@ -2,9 +2,9 @@ package nu.marginalia.crawl.retreival;
 
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
-import nu.marginalia.crawling.io.SerializableCrawlDataStream;
-import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
 import nu.marginalia.lsh.EasyLSH;
+import nu.marginalia.model.crawldata.CrawledDocument;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java
index 37f84d58..c7fee792 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java
@@ -1,9 +1,9 @@
 package nu.marginalia.crawl.retreival;
 
-import nu.marginalia.crawling.body.HttpFetchResult;
-import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawlerDocumentStatus;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.HttpFetchResult;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
 
 import java.time.LocalDateTime;
 import java.util.Objects;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index 45ec5b4b..81fbca89 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -5,16 +5,17 @@ import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
 import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
 import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
-import nu.marginalia.link_parser.LinkParser;
-import nu.marginalia.crawling.model.*;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
+import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.HttpFetchResult;
+import nu.marginalia.model.crawldata.CrawledDomain;
+import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import org.jsoup.Jsoup;
 import org.slf4j.Logger;
@@ -24,7 +25,9 @@ import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.nio.file.Path;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
 
 public class CrawlerRetreiver implements AutoCloseable {
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
index ab1ce5ef..1468d6ed 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
@@ -1,9 +1,9 @@
 package nu.marginalia.crawl.retreival;
 
-import nu.marginalia.crawling.body.DocumentBodyExtractor;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.DocumentBodyExtractor;
+import nu.marginalia.model.body.HttpFetchResult;
 import org.jsoup.Jsoup;
 import org.netpreserve.jwarc.*;
 import org.slf4j.Logger;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainProber.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainProber.java
index 57147aec..3ec9b8da 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainProber.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainProber.java
@@ -4,10 +4,10 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawling.model.CrawlerDomainStatus;
 import nu.marginalia.ip_blocklist.IpBlockList;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java
index 96e2eaa7..c9997017 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java
@@ -1,7 +1,7 @@
 package nu.marginalia.crawl.retreival.fetcher;
 
-import nu.marginalia.crawling.body.ContentTypeLogic;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.ContentTypeLogic;
 import okhttp3.OkHttpClient;
 import okhttp3.Request;
 import org.slf4j.Logger;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
index 70576510..a2015e8f 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
@@ -3,10 +3,10 @@ package nu.marginalia.crawl.retreival.fetcher;
 import com.google.inject.ImplementedBy;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.retreival.RateLimitException;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.HttpFetchResult;
 
 import java.util.List;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
index 1df0301b..f4be6b7f 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
@@ -11,12 +11,12 @@ import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeR
 import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory;
 import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL;
-import nu.marginalia.crawling.body.DocumentBodyExtractor;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
-import nu.marginalia.crawling.body.ContentTypeLogic;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.ContentTypeLogic;
+import nu.marginalia.model.body.DocumentBodyExtractor;
+import nu.marginalia.model.body.HttpFetchResult;
 import okhttp3.ConnectionPool;
 import okhttp3.Dispatcher;
 import okhttp3.OkHttpClient;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
index 180811cf..1d4a4372 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
@@ -2,10 +2,10 @@ package nu.marginalia.crawl.retreival.fetcher.warc;
 
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.HttpFetchResult;
 import okhttp3.OkHttpClient;
 import okhttp3.Request;
 import org.netpreserve.jwarc.*;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
index e88ee454..50a9b111 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
@@ -8,9 +8,9 @@ import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
-import nu.marginalia.crawling.body.HttpFetchResult;
-import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.HttpFetchResult;
+import nu.marginalia.model.crawldata.CrawledDocument;
 import org.jsoup.Jsoup;
 
 /** This class encapsulates the logic for re-visiting a domain that has already been crawled.
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
index c604ff5b..b5589401 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
@@ -2,10 +2,10 @@ package nu.marginalia.crawl.retreival.revisit;
 
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawling.body.DocumentBodyExtractor;
-import nu.marginalia.crawling.body.DocumentBodyResult;
-import nu.marginalia.crawling.body.HttpFetchResult;
-import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.model.body.DocumentBodyExtractor;
+import nu.marginalia.model.body.DocumentBodyResult;
+import nu.marginalia.model.body.HttpFetchResult;
+import nu.marginalia.model.crawldata.CrawledDocument;
 
 import javax.annotation.Nullable;
diff --git a/code/process-models/crawling-model/build.gradle b/code/processes/crawling-process/model/build.gradle
similarity index 93%
rename from code/process-models/crawling-model/build.gradle
rename to code/processes/crawling-process/model/build.gradle
index 2a24d8bf..5e4879d1 100644
--- a/code/process-models/crawling-model/build.gradle
+++ b/code/processes/crawling-process/model/build.gradle
@@ -12,6 +12,8 @@ java {
     }
 }
 
+jar.archiveBaseName = 'crawling-process-model'
+
 apply from: "$rootProject.projectDir/srcsets.gradle"
 
 dependencies {
@@ -30,6 +32,7 @@ dependencies {
     implementation libs.notnull
 
     implementation libs.bundles.parquet
+    implementation libs.trove
     implementation libs.jwarc
     implementation libs.gson
     implementation libs.commons.io
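
The new jar.archiveBaseName line above deserves a note: after the move this Gradle project is just ':model' under crawling-process, and this commit introduces other process modules that are also named 'model'. Gradle derives the default archive name from the project name, so without the explicit override the jars would collide as model.jar. This rationale is inferred from the build layout; the commit itself doesn't state it.
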
diff --git a/code/process-models/crawl-spec/java/nu/marginalia/crawlspec/CrawlSpecFileNames.java b/code/processes/crawling-process/model/java/nu/marginalia/crawlspec/CrawlSpecFileNames.java
similarity index 100%
rename from code/process-models/crawl-spec/java/nu/marginalia/crawlspec/CrawlSpecFileNames.java
rename to code/processes/crawling-process/model/java/nu/marginalia/crawlspec/CrawlSpecFileNames.java
diff --git a/code/process-models/crawl-spec/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java b/code/processes/crawling-process/model/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java
similarity index 100%
rename from code/process-models/crawl-spec/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java
rename to code/processes/crawling-process/model/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java
similarity index 86%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainReader.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java
index 3f8123b2..272ebf3b 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainReader.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java
@@ -1,8 +1,9 @@
-package nu.marginalia.crawling.io;
+package nu.marginalia.io.crawldata;
 
-import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
+import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
 
-import java.io.*;
+import java.io.FileNotFoundException;
+import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawlerOutputFile.java
similarity index 98%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawlerOutputFile.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawlerOutputFile.java
index 05c4797e..266a7f24 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawlerOutputFile.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawlerOutputFile.java
@@ -1,4 +1,4 @@
-package nu.marginalia.crawling.io;
+package nu.marginalia.io.crawldata;
 
 import org.apache.logging.log4j.util.Strings;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/SerializableCrawlDataStream.java
similarity index 94%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/SerializableCrawlDataStream.java
index ce01ebce..1ade3836 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/SerializableCrawlDataStream.java
@@ -1,6 +1,6 @@
-package nu.marginalia.crawling.io;
+package nu.marginalia.io.crawldata;
 
-import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.model.crawldata.SerializableCrawlData;
 import org.jetbrains.annotations.Nullable;
 
 import java.io.IOException;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
similarity index 95%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
index e676e351..55c5ce8e 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
@@ -1,14 +1,14 @@
-package nu.marginalia.crawling.io.format;
+package nu.marginalia.io.crawldata.format;
 
 import lombok.SneakyThrows;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.contenttype.DocumentBodyToString;
-import nu.marginalia.crawling.io.SerializableCrawlDataStream;
-import nu.marginalia.crawling.model.*;
-import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord;
-import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawldata.*;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecord;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
diff --git a/code/process-models/crawl-spec/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java
similarity index 100%
rename from code/process-models/crawl-spec/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java
diff --git a/code/process-models/crawl-spec/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java
similarity index 100%
rename from code/process-models/crawl-spec/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/ContentTypeLogic.java b/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
similarity index 98%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/body/ContentTypeLogic.java
rename to code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
index 25d4c8ec..c38bcb3b 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/ContentTypeLogic.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
@@ -1,4 +1,4 @@
-package nu.marginalia.crawling.body;
+package nu.marginalia.model.body;
 
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.model.EdgeUrl;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java b/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyExtractor.java
similarity index 96%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java
rename to code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyExtractor.java
index 7c8f471c..ebd3d33e 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyExtractor.java
@@ -1,9 +1,9 @@
-package nu.marginalia.crawling.body;
+package nu.marginalia.model.body;
 
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.contenttype.ContentTypeParser;
 import nu.marginalia.contenttype.DocumentBodyToString;
-import nu.marginalia.crawling.model.CrawlerDocumentStatus;
+import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/DocumentBodyResult.java b/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyResult.java
similarity index 95%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/body/DocumentBodyResult.java
rename to code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyResult.java
index 04e3fedb..a29e7093 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/DocumentBodyResult.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyResult.java
@@ -1,7 +1,7 @@
-package nu.marginalia.crawling.body;
+package nu.marginalia.model.body;
 
 import nu.marginalia.contenttype.ContentType;
-import nu.marginalia.crawling.model.CrawlerDocumentStatus;
+import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
 
 import java.util.Optional;
 import java.util.function.BiFunction;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/HttpFetchResult.java b/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java
similarity index 99%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/body/HttpFetchResult.java
rename to code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java
index 6bafaf5c..d3fd41b0 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/HttpFetchResult.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java
@@ -1,11 +1,11 @@
-package nu.marginalia.crawling.body;
+package nu.marginalia.model.body;
 
 import nu.marginalia.contenttype.ContentType;
 import okhttp3.Headers;
 import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
 import org.netpreserve.jwarc.MessageHeaders;
 import org.netpreserve.jwarc.WarcResponse;
-import org.jsoup.nodes.Document;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
similarity index 98%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDocument.java
rename to code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
index c809682a..f43433b9 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDocument.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
@@ -1,4 +1,4 @@
-package nu.marginalia.crawling.model;
+package nu.marginalia.model.crawldata;
 
 import lombok.AllArgsConstructor;
 import lombok.Builder;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDomain.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDomain.java
similarity index 94%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDomain.java
rename to code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDomain.java
index adb59bda..3cb1ea51 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDomain.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDomain.java
@@ -1,4 +1,4 @@
-package nu.marginalia.crawling.model;
+package nu.marginalia.model.crawldata;
 
 import lombok.AllArgsConstructor;
 import lombok.Builder;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawlerDocumentStatus.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawlerDocumentStatus.java
similarity index 80%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawlerDocumentStatus.java
rename to code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawlerDocumentStatus.java
index 2369bcc6..d796c6de 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawlerDocumentStatus.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawlerDocumentStatus.java
@@ -1,4 +1,4 @@
-package nu.marginalia.crawling.model;
+package nu.marginalia.model.crawldata;
 
 public enum CrawlerDocumentStatus {
     OK,
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawlerDomainStatus.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawlerDomainStatus.java
similarity index 64%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawlerDomainStatus.java
rename to code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawlerDomainStatus.java
index 12a31c52..4efc9c59 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawlerDomainStatus.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawlerDomainStatus.java
@@ -1,4 +1,4 @@
-package nu.marginalia.crawling.model;
+package nu.marginalia.model.crawldata;
 
 public enum CrawlerDomainStatus {
     OK, ERROR, BLOCKED, REDIRECT
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/SerializableCrawlData.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/SerializableCrawlData.java
similarity index 63%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/model/SerializableCrawlData.java
rename to code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/SerializableCrawlData.java
index 01ecaf8d..58d25dea 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/SerializableCrawlData.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/SerializableCrawlData.java
@@ -1,4 +1,4 @@
-package nu.marginalia.crawling.model;
+package nu.marginalia.model.crawldata;
 
 public interface SerializableCrawlData {
     String getDomain();
diff --git a/code/process-models/crawl-spec/java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java
similarity index 100%
rename from code/process-models/crawl-spec/java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java
rename to code/processes/crawling-process/model/java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecord.java
similarity index 97%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java
rename to code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecord.java
index 55deafdb..e4ce7ad9 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecord.java
@@ -1,4 +1,4 @@
-package nu.marginalia.crawling.parquet;
+package nu.marginalia.parquet.crawldata;
 
 import blue.strategic.parquet.Dehydrator;
 import blue.strategic.parquet.Hydrator;
@@ -12,7 +12,7 @@ import org.apache.parquet.schema.Types;
 
 import java.time.Instant;
 
-import static org.apache.parquet.schema.LogicalTypeAnnotation.*;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType;
 import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*;
 
 @AllArgsConstructor
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileReader.java
similarity index 97%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java
rename to code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileReader.java
index 362eb561..6e4ea942 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileReader.java
@@ -1,4 +1,4 @@
-package nu.marginalia.crawling.parquet;
+package nu.marginalia.parquet.crawldata;
 
 import blue.strategic.parquet.Hydrator;
 import blue.strategic.parquet.HydratorSupplier;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java
similarity index 97%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java
rename to code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java
index 539ff28d..36a58673 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java
@@ -1,10 +1,10 @@
-package nu.marginalia.crawling.parquet;
+package nu.marginalia.parquet.crawldata;
 
 import blue.strategic.parquet.ParquetWriter;
 import nu.marginalia.UserAgent;
-import nu.marginalia.crawling.body.DocumentBodyExtractor;
-import nu.marginalia.crawling.body.DocumentBodyResult;
-import nu.marginalia.crawling.body.HttpFetchResult;
+import nu.marginalia.model.body.DocumentBodyExtractor;
+import nu.marginalia.model.body.DocumentBodyResult;
+import nu.marginalia.model.body.HttpFetchResult;
 import org.apache.commons.lang3.StringUtils;
 import org.netpreserve.jwarc.*;
 import org.slf4j.Logger;
diff --git a/code/process-models/crawling-model/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java b/code/processes/crawling-process/model/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java
similarity index 100%
rename from code/process-models/crawling-model/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java
rename to code/processes/crawling-process/model/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java
diff --git a/code/process-models/crawling-model/java/org/netpreserve/jwarc/WarcXEntityRefused.java b/code/processes/crawling-process/model/java/org/netpreserve/jwarc/WarcXEntityRefused.java
similarity index 100%
rename from code/process-models/crawling-model/java/org/netpreserve/jwarc/WarcXEntityRefused.java
rename to code/processes/crawling-process/model/java/org/netpreserve/jwarc/WarcXEntityRefused.java
diff --git a/code/process-models/crawling-model/java/org/netpreserve/jwarc/WarcXResponseReference.java b/code/processes/crawling-process/model/java/org/netpreserve/jwarc/WarcXResponseReference.java
similarity index 100%
rename from code/process-models/crawling-model/java/org/netpreserve/jwarc/WarcXResponseReference.java
rename to code/processes/crawling-process/model/java/org/netpreserve/jwarc/WarcXResponseReference.java
diff --git a/code/process-models/crawling-model/readme.md b/code/processes/crawling-process/model/readme.md
similarity index 100%
rename from code/process-models/crawling-model/readme.md
rename to code/processes/crawling-process/model/readme.md
diff --git a/code/process-models/crawling-model/test/nu/marginalia/crawling/model/CrawledDocumentTest.java b/code/processes/crawling-process/model/test/nu/marginalia/crawling/model/CrawledDocumentTest.java
similarity index 94%
rename from code/process-models/crawling-model/test/nu/marginalia/crawling/model/CrawledDocumentTest.java
rename to code/processes/crawling-process/model/test/nu/marginalia/crawling/model/CrawledDocumentTest.java
index 8612fd39..fdfe52a4 100644
--- a/code/process-models/crawling-model/test/nu/marginalia/crawling/model/CrawledDocumentTest.java
+++ b/code/processes/crawling-process/model/test/nu/marginalia/crawling/model/CrawledDocumentTest.java
@@ -1,8 +1,10 @@
 package nu.marginalia.crawling.model;
 
+import nu.marginalia.model.crawldata.CrawledDocument;
 import org.junit.jupiter.api.Test;
 
-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
 
 class CrawledDocumentTest {
diff --git a/code/process-models/crawling-model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java b/code/processes/crawling-process/model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java
similarity index 90%
rename from code/process-models/crawling-model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java
rename to code/processes/crawling-process/model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java
index a0352f29..0da0f6d8 100644
--- a/code/process-models/crawling-model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java
+++ b/code/processes/crawling-process/model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java
@@ -1,9 +1,11 @@
 package nu.marginalia.crawling.parquet;
 
-import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
-import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.crawldata.CrawledDomain;
+import nu.marginalia.model.crawldata.SerializableCrawlData;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecord;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentParquetRecordFileWriterTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentParquetRecordFileWriterTest.java
index d3369bcc..ebda28e1 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentParquetRecordFileWriterTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentParquetRecordFileWriterTest.java
@@ -1,6 +1,6 @@
 package nu.marginalia.crawl.retreival.fetcher;
 
-import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
 import org.junit.jupiter.api.Test;
 
 import java.util.List;
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
index 206bf798..a9df80ac 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
@@ -3,15 +3,18 @@ package nu.marginalia.crawl.retreival.fetcher;
 import nu.marginalia.UserAgent;
 import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
-import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader;
-import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
 import okhttp3.OkHttpClient;
 import okhttp3.Request;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
-import org.netpreserve.jwarc.*;
+import org.netpreserve.jwarc.WarcReader;
+import org.netpreserve.jwarc.WarcRequest;
+import org.netpreserve.jwarc.WarcResponse;
+import org.netpreserve.jwarc.WarcXResponseReference;
 
 import java.io.IOException;
 import java.net.URISyntaxException;
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java
index e711c81c..9d46ec75 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java
@@ -2,7 +2,7 @@ package nu.marginalia.crawl.retreival.revisit;
 
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.model.crawldata.CrawledDocument;
 import org.junit.jupiter.api.Test;
 
 import static org.junit.jupiter.api.Assertions.*;
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
index 0873924f..63d5aa27 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
@@ -4,11 +4,11 @@ import lombok.SneakyThrows;
 import nu.marginalia.crawl.retreival.RateLimitException;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawling.body.DocumentBodyExtractor;
-import nu.marginalia.crawling.body.DocumentBodyResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
-import nu.marginalia.crawling.body.ContentTypeLogic;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.ContentTypeLogic;
+import nu.marginalia.model.body.DocumentBodyExtractor;
+import nu.marginalia.model.body.DocumentBodyResult;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
index 749b821c..43040313 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
@@ -5,13 +5,13 @@ import lombok.SneakyThrows;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.*;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
-import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawlerDocumentStatus;
-import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.HttpFetchResult;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
+import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import nu.marginalia.test.CommonTestData;
 import okhttp3.Headers;
@@ -23,7 +23,10 @@ import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.net.URISyntaxException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 
 public class CrawlerMockFetcherTest {
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
index aa1f00e7..803ba983 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -8,15 +8,15 @@ import nu.marginalia.crawl.retreival.*;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
-import nu.marginalia.crawling.io.CrawledDomainReader;
-import nu.marginalia.crawling.io.SerializableCrawlDataStream;
-import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.crawling.model.SerializableCrawlData;
-import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
+import nu.marginalia.io.crawldata.CrawledDomainReader;
+import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.crawldata.CrawledDomain;
+import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
 import org.jetbrains.annotations.NotNull;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.*;
diff --git a/code/processes/index-constructor-process/build.gradle b/code/processes/index-constructor-process/build.gradle
index 4653133a..6de7e773 100644
--- a/code/processes/index-constructor-process/build.gradle
+++ b/code/processes/index-constructor-process/build.gradle
@@ -21,7 +21,7 @@ tasks.distZip.enabled = false
 apply from: "$rootProject.projectDir/srcsets.gradle"
 
 dependencies {
-    implementation project(':code:process-mqapi')
+    implementation project(':code:processes:process-mq-api')
     implementation project(':code:common:process')
     implementation project(':code:common:service')
     implementation project(':code:common:db')
diff --git a/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java b/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java
index 4f7e9d90..6c55db6c 100644
--- a/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java
+++ b/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java
@@ -11,7 +11,7 @@ import nu.marginalia.index.construction.prio.PrioIndexConstructor;
 import nu.marginalia.index.domainrankings.DomainRankings;
 import nu.marginalia.index.forward.ForwardIndexConverter;
 import nu.marginalia.index.forward.ForwardIndexFileNames;
-import nu.marginalia.index.journal.reader.IndexJournalReader;
+import nu.marginalia.index.journal.IndexJournal;
 import nu.marginalia.model.gson.GsonFactory;
 import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.mq.MessageQueueFactory;
@@ -119,7 +119,6 @@ public class IndexConstructorMain extends ProcessMainClass {
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,
-                IndexJournalReader::singleFile,
                 this::addRankToIdEncoding,
                 tmpDir);
 
@@ -138,7 +137,6 @@ public class IndexConstructorMain extends ProcessMainClass {
         var constructor = new PrioIndexConstructor(
                 outputFileDocs,
                 outputFileWords,
-                (path) -> IndexJournalReader.singleFile(path).filtering(r -> r != 0),
                 this::addRankToIdEncoding,
                 tmpDir);
 
@@ -148,13 +146,16 @@ public class IndexConstructorMain extends ProcessMainClass {
     private void createForwardIndex() throws IOException {
 
         Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
+
         Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
         Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
+        Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT);
 
         ForwardIndexConverter converter = new ForwardIndexConverter(heartbeat,
-                IndexJournalReader.paging(workDir),
                 outputFileDocsId,
                 outputFileDocsData,
+                outputFileSpansData,
+                IndexJournal.findJournal(workDir).orElseThrow(),
                 domainRankings
         );
diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle
index 90b00d87..5e49ed30 100644
--- a/code/processes/loading-process/build.gradle
+++ b/code/processes/loading-process/build.gradle
@@ -21,7 +21,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
 
 dependencies {
     implementation project(':code:common:process')
-    implementation project(':code:process-mqapi')
+    implementation project(':code:processes:process-mq-api')
     implementation project(':code:index:api')
     implementation project(':code:common:model')
     implementation project(':code:common:db')
@@ -36,9 +36,8 @@ dependencies {
     implementation project(':third-party:parquet-floor')
 
     testImplementation project(':code:services-application:search-service')
-    implementation project(':code:process-models:crawling-model')
-    implementation project(':code:process-models:processed-data')
-    implementation project(':code:process-models:work-log')
+    implementation project(':code:processes:crawling-process:model')
+    implementation project(':code:processes:converting-process:model')
 
     implementation project(':code:features-convert:keyword-extraction')
     implementation project(':code:functions:link-graph:partition')
diff --git a/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java b/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java
index f523f8e7..08c016db 100644
--- a/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java
+++ b/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java
@@ -4,65 +4,59 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import lombok.SneakyThrows;
 import nu.marginalia.IndexLocations;
-import nu.marginalia.index.journal.model.IndexJournalEntryData;
+import nu.marginalia.index.journal.IndexJournal;
+import nu.marginalia.index.journal.IndexJournalSlopWriter;
+import nu.marginalia.model.processed.SlopDocumentRecord;
 import nu.marginalia.storage.FileStorageService;
-import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
-import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
-import nu.marginalia.index.journal.writer.IndexJournalWriter;
-import nu.marginalia.keyword.model.DocumentKeywords;
-import nu.marginalia.index.journal.IndexJournalFileNames;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
-import java.nio.file.Files;
+import java.nio.file.Path;
 
 @Singleton
 public class LoaderIndexJournalWriter {
-    private final IndexJournalWriter indexWriter;
     private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class);
+    private final Path journalPath;
 
-    private final long[] buffer = new long[65536];
-
+    private IndexJournalSlopWriter currentWriter = null;
+    private long recordsWritten = 0;
+    private int page;
 
     @Inject
     public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException {
         var indexArea = IndexLocations.getIndexConstructionArea(fileStorageService);
 
-        var existingIndexFiles = IndexJournalFileNames.findJournalFiles(indexArea);
-        for (var existingFile : existingIndexFiles) {
-            Files.delete(existingFile);
+        journalPath = IndexJournal.allocateName(indexArea);
+        page = IndexJournal.numPages(journalPath);
+
+        switchToNextVersion();
+
+        logger.info("Creating Journal Writer {}", indexArea);
+    }
+
+    private void switchToNextVersion() throws IOException {
+        if (currentWriter != null) {
+            currentWriter.close();
         }
 
-        indexWriter = new IndexJournalWriterPagingImpl(indexArea);
+        currentWriter = new IndexJournalSlopWriter(journalPath, page++);
     }
 
     @SneakyThrows
-    public void putWords(long combinedId,
-                         int features,
-                         long metadata,
-                         int length,
-                         DocumentKeywords wordSet) {
-
-        if (wordSet.isEmpty()) {
-            logger.info("Skipping zero-length word set for {}", combinedId);
-            return;
+    public void putWords(long header, SlopDocumentRecord.KeywordsProjection data)
+    {
+        if (++recordsWritten > 200_000) {
+            recordsWritten = 0;
+            switchToNextVersion();
         }
 
-        if (combinedId <= 0) {
-            logger.warn("Bad ID: {}", combinedId);
-            return;
-        }
-
-        var header = new IndexJournalEntryHeader(combinedId, features, length, metadata);
-        var data = new IndexJournalEntryData(wordSet.keywords, wordSet.metadata, wordSet.positions);
-
-        indexWriter.put(header, data);
+        currentWriter.put(header, data);
     }
 
-    public void close() throws Exception {
-        indexWriter.close();
+    public void close() throws IOException {
+        currentWriter.close();
     }
 }
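
Taken together, LoaderIndexJournalWriter above summarizes the writer side of the new slop-based journal. Stripped of the page-rotation bookkeeping, the calls it relies on look like this (a sketch assembled from the code in this patch, not a reference example; the wrapper class and method names are invented for illustration, and error handling is elided):

    import nu.marginalia.index.journal.IndexJournal;
    import nu.marginalia.index.journal.IndexJournalSlopWriter;
    import nu.marginalia.model.processed.SlopDocumentRecord;

    import java.io.IOException;
    import java.nio.file.Path;

    class JournalWriteSketch {
        void writeOne(Path indexArea,
                      long combinedId,
                      SlopDocumentRecord.KeywordsProjection projection) throws IOException {
            // Reserve a journal directory under the construction area,
            // then continue numbering after any pages already present there
            Path journalPath = IndexJournal.allocateName(indexArea);
            int page = IndexJournal.numPages(journalPath);

            try (var writer = new IndexJournalSlopWriter(journalPath, page)) {
                writer.put(combinedId, projection); // one record per document
            }
        }
    }

The rotation to a fresh page every 200,000 records in putWords presumably keeps individual pages small enough to process in memory during index construction, though the patch doesn't state the rationale.
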
diff --git a/code/processes/loading-process/java/nu/marginalia/loading/LoaderInputData.java b/code/processes/loading-process/java/nu/marginalia/loading/LoaderInputData.java
index 21f878f0..7dda3e05 100644
--- a/code/processes/loading-process/java/nu/marginalia/loading/LoaderInputData.java
+++ b/code/processes/loading-process/java/nu/marginalia/loading/LoaderInputData.java
@@ -1,6 +1,10 @@
 package nu.marginalia.loading;
 
 import nu.marginalia.io.processed.ProcessedDataFileNames;
+import nu.marginalia.model.processed.SlopDocumentRecord;
+import nu.marginalia.model.processed.SlopDomainLinkRecord;
+import nu.marginalia.model.processed.SlopDomainRecord;
+import nu.marginalia.model.processed.SlopPageRef;
 import nu.marginalia.worklog.BatchingWorkLogInspector;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -39,26 +43,32 @@ public class LoaderInputData {
         lastGoodBatch.put(singleSource, lastBatch);
     }
 
-    public Collection<Path> listDomainFiles() {
-        List<Path> pathsAll = new ArrayList<>();
+    public Collection<SlopPageRef<SlopDomainRecord>> listDomainPages() {
+        List<SlopPageRef<SlopDomainRecord>> pathsAll = new ArrayList<>();
         for (var source : sourceDirectories) {
-            pathsAll.addAll(ProcessedDataFileNames.listDomainFiles(source, lastGoodBatch.get(source)));
+            for (int i = 0; i < lastGoodBatch.get(source); i++) {
+                pathsAll.add(new SlopPageRef<>(ProcessedDataFileNames.domainFileName(source), i));
+            }
         }
         return pathsAll;
     }
 
-    public Collection<Path> listDomainLinkFiles() {
-        List<Path> pathsAll = new ArrayList<>();
+    public Collection<SlopPageRef<SlopDomainLinkRecord>> listDomainLinkPages() {
+        List<SlopPageRef<SlopDomainLinkRecord>> pathsAll = new ArrayList<>();
         for (var source : sourceDirectories) {
-            pathsAll.addAll(ProcessedDataFileNames.listDomainLinkFiles(source, lastGoodBatch.get(source)));
+            for (int i = 0; i < lastGoodBatch.get(source); i++) {
+                pathsAll.add(new SlopPageRef<>(ProcessedDataFileNames.domainLinkFileName(source), i));
+            }
         }
         return pathsAll;
     }
 
-    public Collection<Path> listDocumentFiles() {
-        List<Path> pathsAll = new ArrayList<>();
+    public Collection<SlopPageRef<SlopDocumentRecord>> listDocumentFiles() {
+        List<SlopPageRef<SlopDocumentRecord>> pathsAll = new ArrayList<>();
         for (var source : sourceDirectories) {
-            pathsAll.addAll(ProcessedDataFileNames.listDocumentFiles(source, lastGoodBatch.get(source)));
+            for (int i = 0; i < lastGoodBatch.get(source); i++) {
+                pathsAll.add(new SlopPageRef<>(ProcessedDataFileNames.documentFileName(source), i));
+            }
         }
         return pathsAll;
     }
diff --git a/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java
index 7cc9b522..e254d51e 100644
--- a/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java
+++ b/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java
@@ -3,22 +3,22 @@ package nu.marginalia.loading.documents;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import lombok.SneakyThrows;
-import nu.marginalia.io.processed.DocumentRecordParquetFileReader;
 import nu.marginalia.linkdb.docs.DocumentDbWriter;
 import nu.marginalia.linkdb.model.DocdbUrlDetail;
 import nu.marginalia.loading.LoaderInputData;
 import nu.marginalia.loading.domains.DomainIdRegistry;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.id.UrlIdCodec;
-import nu.marginalia.model.processed.DocumentRecordMetadataProjection;
+import nu.marginalia.model.processed.SlopDocumentRecord;
+import nu.marginalia.model.processed.SlopPageRef;
 import nu.marginalia.process.control.ProcessHeartbeat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
-import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
 
 @Singleton
@@ -38,18 +38,24 @@ public class DocumentLoaderService {
                                  LoaderInputData inputData)
             throws IOException, SQLException
     {
-        var documentFiles = inputData.listDocumentFiles();
+        Collection<SlopPageRef<SlopDocumentRecord>> pageRefs = inputData.listDocumentFiles();
 
         try (var taskHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("DOCUMENTS")) {
 
             int processed = 0;
 
-            for (var file : documentFiles) {
-                taskHeartbeat.progress("LOAD", processed++, documentFiles.size());
+            for (var pageRef : pageRefs) {
+                taskHeartbeat.progress("LOAD", processed++, pageRefs.size());
 
-                loadDocumentsFromFile(domainIdRegistry, file);
+                try (var reader = new SlopDocumentRecord.MetadataReader(pageRef);
+                     LinkdbLoader loader = new LinkdbLoader(domainIdRegistry))
+                {
+                    while (reader.hasNext()) {
+                        loader.accept(reader.next());
+                    }
+                }
             }
-            taskHeartbeat.progress("LOAD", processed, documentFiles.size());
+            taskHeartbeat.progress("LOAD", processed, pageRefs.size());
         } catch (IOException e) {
             logger.error("Failed to load documents", e);
             throw e;
@@ -60,19 +66,6 @@ public class DocumentLoaderService {
         return true;
     }
 
-    private void loadDocumentsFromFile(DomainIdRegistry domainIdRegistry, Path file)
-            throws SQLException, IOException
-    {
-        try (var stream = DocumentRecordParquetFileReader.streamMetadataProjection(file);
-             LinkdbLoader loader = new LinkdbLoader(domainIdRegistry)
-        )
-        {
-            logger.info("Loading document meta from {}", file);
-
-            stream.forEach(loader::accept);
-        }
-    }
-
     class LinkdbLoader implements AutoCloseable {
         private final DomainIdRegistry domainIdRegistry;
         private final List<DocdbUrlDetail> details = new ArrayList<>(1000);
@@ -82,25 +75,25 @@ public class DocumentLoaderService {
         }
 
         @SneakyThrows
-        public void accept(DocumentRecordMetadataProjection projection)
+        public void accept(SlopDocumentRecord.MetadataProjection projection)
        {
             long urlId = UrlIdCodec.encodeId(
-                    domainIdRegistry.getDomainId(projection.domain),
-                    projection.ordinal
+                    domainIdRegistry.getDomainId(projection.domain()),
+                    projection.ordinal()
             );
 
-            details.add(new DocdbUrlDetail(
+            documentDbWriter.add(new DocdbUrlDetail(
                     urlId,
-                    new EdgeUrl(projection.url),
-                    projection.title,
-                    projection.description,
-                    projection.quality,
-                    projection.htmlStandard,
-                    projection.htmlFeatures,
-                    projection.pubYear,
-                    projection.hash,
-                    projection.getLength()
+                    new EdgeUrl(projection.url()),
+                    projection.title(),
+                    projection.description(),
+                    projection.quality(),
+                    projection.htmlStandard(),
+                    projection.htmlFeatures(),
+                    projection.pubYear(),
+                    projection.hash(),
+                    projection.length()
             ));
 
             if (details.size() > 100) {
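
One thing worth flagging in the DocumentLoaderService hunk above: accept() now writes straight through documentDbWriter.add(...), while the details batch list and its size-based flush are kept but never populated, so the batching is now dead code. That may be acceptable for a work-in-progress commit, but the two mechanisms probably shouldn't survive together.
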
diff --git a/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java
index dc325b2b..5188c06b 100644
--- a/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java
+++ b/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java
@@ -2,20 +2,18 @@ package nu.marginalia.loading.documents;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import nu.marginalia.io.processed.DocumentRecordParquetFileReader;
-import nu.marginalia.keyword.model.DocumentKeywords;
 import nu.marginalia.loading.LoaderIndexJournalWriter;
 import nu.marginalia.loading.LoaderInputData;
 import nu.marginalia.loading.domains.DomainIdRegistry;
 import nu.marginalia.model.id.UrlIdCodec;
-import nu.marginalia.model.processed.DocumentRecordKeywordsProjection;
+import nu.marginalia.model.processed.SlopDocumentRecord;
+import nu.marginalia.model.processed.SlopPageRef;
 import nu.marginalia.process.control.ProcessHeartbeat;
-import nu.marginalia.sequence.CodedSequence;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
-import java.nio.file.Path;
+import java.util.Collection;
 
 @Singleton
 public class KeywordLoaderService {
@@ -32,57 +30,41 @@ public class KeywordLoaderService {
                                LoaderInputData inputData) throws IOException {
         try (var task = heartbeat.createAdHocTaskHeartbeat("KEYWORDS")) {
-            var documentFiles = inputData.listDocumentFiles();
+            Collection<SlopPageRef<SlopDocumentRecord>> documentFiles = inputData.listDocumentFiles();
             int processed = 0;
 
-            for (var file : documentFiles) {
+            for (SlopPageRef<SlopDocumentRecord> pageRef : documentFiles) {
                 task.progress("LOAD", processed++, documentFiles.size());
 
-                loadKeywordsFromFile(domainIdRegistry, file);
+                try (var keywordsReader = new SlopDocumentRecord.KeywordsProjectionReader(pageRef)) {
+                    logger.info("Loading keywords from {}", pageRef);
+
+                    while (keywordsReader.hasMore()) {
+                        var projection = keywordsReader.next();
+
+                        long combinedId = UrlIdCodec.encodeId(
+                                domainIdRegistry.getDomainId(projection.domain()),
+                                projection.ordinal());
+
+                        writer.putWords(combinedId, projection);
+                    }
+                }
             }
 
             task.progress("LOAD", processed, documentFiles.size());
         }
+        catch (IOException e) {
+            logger.error("Failed to load keywords", e);
+            throw e;
+        }
 
         logger.info("Finished");
 
         return true;
     }
 
-    private void loadKeywordsFromFile(DomainIdRegistry domainIdRegistry, Path file) throws IOException {
-        try (var stream = DocumentRecordParquetFileReader.streamKeywordsProjection(file)) {
-            logger.info("Loading keywords from {}", file);
-            stream.filter(DocumentRecordKeywordsProjection::hasKeywords)
-                  .forEach(proj -> insertKeywords(domainIdRegistry, proj));
-        }
-    }
-
-    private void insertKeywords(DomainIdRegistry domainIdRegistry,
-                                DocumentRecordKeywordsProjection projection)
-    {
-        long combinedId = UrlIdCodec.encodeId(
-                domainIdRegistry.getDomainId(projection.domain),
-                projection.ordinal);
-
-        var words = new DocumentKeywords(
-                projection.words.toArray(String[]::new),
-                projection.metas.toArray(),
-                projection.positions.toArray(CodedSequence[]::new)
-        );
-
-        writer.putWords(combinedId,
-                projection.htmlFeatures,
-                projection.documentMetadata,
-                projection.length,
-                words);
-    }
-
-    public void close() {
-        try {
-            writer.close();
-        } catch (Exception e) {
-            logger.error("Failed to close writer", e);
-        }
+    public void close() throws IOException {
+        writer.close();
     }
 }
\ No newline at end of file
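
For symmetry with the writer sketch earlier, the read side of the new format as used above: a slop page is addressed by a (file, page index) pair via SlopPageRef, and each projection reader iterates one page. A minimal sketch using only calls visible in this patch (the wrapper class and its printing are invented for illustration):

    import nu.marginalia.model.processed.SlopDocumentRecord;
    import nu.marginalia.model.processed.SlopPageRef;

    import java.io.IOException;
    import java.nio.file.Path;

    class KeywordDumpSketch {
        void dump(Path documentFile, int page) throws IOException {
            var pageRef = new SlopPageRef<SlopDocumentRecord>(documentFile, page);

            // Stream the keywords projection of one page of document records
            try (var reader = new SlopDocumentRecord.KeywordsProjectionReader(pageRef)) {
                while (reader.hasMore()) {
                    var projection = reader.next();
                    System.out.println(projection.domain() + " #" + projection.ordinal());
                }
            }
        }
    }
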
diff --git a/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java
index 342645dd..ac1fc763 100644
--- a/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java
+++ b/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java
@@ -4,12 +4,11 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.ProcessConfiguration;
-import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader;
-import nu.marginalia.io.processed.DomainRecordParquetFileReader;
 import nu.marginalia.loading.LoaderInputData;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.processed.DomainRecord;
-import nu.marginalia.model.processed.DomainWithIp;
+import nu.marginalia.model.processed.SlopDomainLinkRecord;
+import nu.marginalia.model.processed.SlopDomainRecord;
+import nu.marginalia.model.processed.SlopPageRef;
 import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.process.control.ProcessHeartbeatImpl;
 import org.slf4j.Logger;
@@ -57,44 +56,61 @@ public class DomainLoaderService {
         try (var conn = dataSource.getConnection();
              var taskHeartbeat = heartbeat.createProcessTaskHeartbeat(Steps.class, "DOMAIN_IDS");
              var selectStmt = conn.prepareStatement("""
-                     SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?
+                     SELECT ID, LOWER(DOMAIN_NAME) FROM EC_DOMAIN
                      """)
         ) {
             taskHeartbeat.progress(Steps.PREP_DATA);
 
-            try (var inserter = new DomainInserter(conn, nodeId)) {
-                for (var domainWithIp : readBasicDomainInformation(inputData)) {
-                    inserter.accept(new EdgeDomain(domainWithIp.domain));
-                    domainNamesAll.add(domainWithIp.domain);
+            // Add domain names from this data set with the current node affinity
+            for (SlopPageRef<SlopDomainRecord> page : inputData.listDomainPages()) {
+
+                try (var inserter = new DomainInserter(conn, nodeId);
+                     var reader = new SlopDomainRecord.DomainNameReader(page)
+                ) {
+                    while (reader.hasMore()) {
+                        String domainName = reader.next();
+                        inserter.accept(new EdgeDomain(domainName));
+                        domainNamesAll.add(domainName);
+                    }
                 }
             }
-            try (var inserter = new DomainInserter(conn, -1)) {
-                for (var domain : readReferencedDomainNames(inputData)) {
-                    inserter.accept(new EdgeDomain(domain));
-                    domainNamesAll.add(domain);
+
+            // Add linked domains, but with -1 affinity meaning they can be grabbed by any index node
+            for (SlopPageRef<SlopDomainLinkRecord> page : inputData.listDomainLinkPages()) {
+                try (var inserter = new DomainInserter(conn, -1);
+                     var reader = new SlopDomainLinkRecord.Reader(page)) {
+                    while (reader.hasMore()) {
+                        SlopDomainLinkRecord record = reader.next();
+                        inserter.accept(new EdgeDomain(record.dest()));
+                        domainNamesAll.add(record.dest());
+                    }
                 }
             }
 
             taskHeartbeat.progress(Steps.INSERT_NEW);
 
-            try (var updater = new DomainAffinityAndIpUpdater(conn, nodeId)) {
-                for (var domainWithIp : readBasicDomainInformation(inputData)) {
-                    updater.accept(new EdgeDomain(domainWithIp.domain), domainWithIp.ip);
+            // Update the node affinity and IP address for each domain
+            for (SlopPageRef<SlopDomainRecord> page : inputData.listDomainPages()) {
+                try (var updater = new DomainAffinityAndIpUpdater(conn, nodeId);
+                     var reader = new SlopDomainRecord.DomainWithIpReader(page)
+                ) {
+                    while (reader.hasMore()) {
+                        var domainWithIp = reader.next();
+                        updater.accept(new EdgeDomain(domainWithIp.domain()), domainWithIp.ip());
+                    }
                 }
             }
 
             taskHeartbeat.progress(Steps.FETCH_ALL);
 
-            selectStmt.setFetchSize(1000);
-            for (var domain : domainNamesAll) {
-                selectStmt.setString(1, domain);
-                var rs = selectStmt.executeQuery();
-                if (rs.next()) {
+
+            var rs = selectStmt.executeQuery();
+            while (rs.next()) {
+                String domain = rs.getString(2);
+
+                if (domainNamesAll.contains(domain)) {
                     ret.add(domain, rs.getInt(1));
                 }
-                else {
-                    logger.error("Unknown domain {}", domain);
-                }
             }
 
             taskHeartbeat.progress(Steps.DONE);
@@ -103,46 +119,23 @@ public class DomainLoaderService {
         return ret;
     }
 
-    Collection<DomainWithIp> readBasicDomainInformation(LoaderInputData inputData) throws IOException {
-        final Set<DomainWithIp> domainsAll = new HashSet<>(100_000);
-
-        var domainFiles = inputData.listDomainFiles();
-        for (var file : domainFiles) {
-            domainsAll.addAll(DomainRecordParquetFileReader.getBasicDomainInformation(file));
-        }
-
-        return domainsAll;
-    }
-
-    Collection<String> readReferencedDomainNames(LoaderInputData inputData) throws IOException {
-        final Set<String> domainNamesAll = new HashSet<>(100_000);
-
-        var linkFiles = inputData.listDomainLinkFiles();
-        for (var file : linkFiles) {
-            domainNamesAll.addAll(DomainLinkRecordParquetFileReader.getDestDomainNames(file));
-        }
-
-        return domainNamesAll;
-    }
-
     public boolean loadDomainMetadata(DomainIdRegistry domainIdRegistry, ProcessHeartbeat heartbeat, LoaderInputData inputData) {
-        var files = inputData.listDomainFiles();
-
         try (var taskHeartbeat = heartbeat.createAdHocTaskHeartbeat("UPDATE-META")) {
             int processed = 0;
-            for (var file : files) {
-                taskHeartbeat.progress("UPDATE-META", processed++, files.size());
+            Collection<SlopPageRef<SlopDomainRecord>> pages = inputData.listDomainPages();
+            for (var page : pages) {
+                taskHeartbeat.progress("UPDATE-META", processed++, pages.size());
 
-                try (var stream = DomainRecordParquetFileReader.stream(file);
-                     var updater = new DomainMetadataUpdater(dataSource, domainIdRegistry)
-                ) {
-                    stream.forEach(updater::accept);
+                try (var reader = new SlopDomainRecord.Reader(page);
+                     var updater = new DomainMetadataUpdater(dataSource, domainIdRegistry))
+                {
+                    reader.forEach(updater::accept);
                 }
             }
-            taskHeartbeat.progress("UPDATE-META", processed, files.size());
+            taskHeartbeat.progress("UPDATE-META", processed, pages.size());
         }
         catch (Exception ex) {
             logger.error("Failed inserting metadata!", ex);
@@ -239,12 +232,12 @@ public class DomainLoaderService {
             """);
         }
 
-        public void accept(DomainRecord domainRecord) {
+        public void accept(SlopDomainRecord domainRecord) {
             try {
-                updateStatement.setInt(1, idRegistry.getDomainId(domainRecord.domain));
-                updateStatement.setInt(2, domainRecord.visitedUrls);
-                updateStatement.setInt(3, domainRecord.goodUrls);
-                updateStatement.setInt(4, domainRecord.knownUrls);
+                updateStatement.setInt(1, idRegistry.getDomainId(domainRecord.domain()));
+                updateStatement.setInt(2, domainRecord.visitedUrls());
+                updateStatement.setInt(3, domainRecord.goodUrls());
+                updateStatement.setInt(4, domainRecord.knownUrls());
                 updateStatement.addBatch();
 
                 if (++i > 1000) {
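
The DOMAIN_IDS step above also changes query strategy: rather than issuing one SELECT ... WHERE DOMAIN_NAME=? round trip per domain, it scans EC_DOMAIN once and filters against the in-memory name set, which is typically the better deal when the loader touches a large fraction of the table. A rough JDBC sketch of the same pattern (table and column names are taken from the diff; the method wrapper is illustrative):

    import java.sql.Connection;
    import java.sql.SQLException;
    import java.util.HashMap;
    import java.util.Map;
    import java.util.Set;

    public class DomainIdScanSketch {
        // One scan over EC_DOMAIN, filtered in memory, instead of one
        // point query per domain name.
        static Map<String, Integer> resolveIds(Connection conn, Set<String> names) throws SQLException {
            Map<String, Integer> ids = new HashMap<>(names.size());
            try (var stmt = conn.prepareStatement("SELECT ID, LOWER(DOMAIN_NAME) FROM EC_DOMAIN")) {
                stmt.setFetchSize(1000); // stream rows rather than buffering the table
                var rs = stmt.executeQuery();
                while (rs.next()) {
                    String domain = rs.getString(2);
                    if (names.contains(domain)) { // names must be lower-cased to match
                        ids.put(domain, rs.getInt(1));
                    }
                }
            }
            return ids;
        }
    }

Since the scan returns LOWER(DOMAIN_NAME), the name set has to hold lower-cased entries for contains() to hit; note also that the old per-domain "Unknown domain" error report has no direct equivalent in the scan-and-filter version.
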
diff --git a/code/processes/loading-process/java/nu/marginalia/loading/links/DomainLinksLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/links/DomainLinksLoaderService.java
index 9d0a5384..790e80a3 100644
--- a/code/processes/loading-process/java/nu/marginalia/loading/links/DomainLinksLoaderService.java
+++ b/code/processes/loading-process/java/nu/marginalia/loading/links/DomainLinksLoaderService.java
@@ -3,17 +3,17 @@ package nu.marginalia.loading.links;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import lombok.SneakyThrows;
-import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader;
 import nu.marginalia.linkgraph.io.DomainLinksWriter;
 import nu.marginalia.loading.LoaderInputData;
 import nu.marginalia.loading.domains.DomainIdRegistry;
-import nu.marginalia.model.processed.DomainLinkRecord;
+import nu.marginalia.model.processed.SlopDomainLinkRecord;
+import nu.marginalia.model.processed.SlopPageRef;
 import nu.marginalia.process.control.ProcessHeartbeat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
-import java.nio.file.Path;
+import java.util.Collection;
 
 @Singleton
 public class DomainLinksLoaderService {
@@ -32,17 +32,17 @@ public class DomainLinksLoaderService {
                               LoaderInputData inputData) throws IOException {
         try (var task = heartbeat.createAdHocTaskHeartbeat("LINKS")) {
-            var linkFiles = inputData.listDomainLinkFiles();
+            Collection<SlopPageRef<SlopDomainLinkRecord>> pageRefs = inputData.listDomainLinkPages();
 
             int processed = 0;
 
-            for (var file : linkFiles) {
-                task.progress("LOAD", processed++, linkFiles.size());
+            for (var pageRef : pageRefs) {
+                task.progress("LOAD", processed++, pageRefs.size());
 
-                loadLinksFromFile(domainIdRegistry, file);
+                loadLinksFromFile(domainIdRegistry, pageRef);
             }
 
-            task.progress("LOAD", processed, linkFiles.size());
+            task.progress("LOAD", processed, pageRefs.size());
         }
         catch (IOException e) {
             logger.error("Failed to load links", e);
@@ -53,12 +53,13 @@ public class DomainLinksLoaderService {
         return true;
     }
 
-    private void loadLinksFromFile(DomainIdRegistry domainIdRegistry, Path file) throws IOException {
-        try (var domainStream = DomainLinkRecordParquetFileReader.stream(file);
+    private void loadLinksFromFile(DomainIdRegistry domainIdRegistry, SlopPageRef<SlopDomainLinkRecord> pageRef) throws IOException {
+        try (var domainLinkReader = new SlopDomainLinkRecord.Reader(pageRef);
             var linkLoader = new LinkLoader(domainIdRegistry))
         {
-            logger.info("Loading links from {}", file);
-            domainStream.forEach(linkLoader::accept);
+            logger.info("Loading links from {}:{}", pageRef.baseDir(), pageRef.page());
+
+            domainLinkReader.forEach(linkLoader::accept);
         }
     }
 
@@ -70,10 +71,10 @@ public class DomainLinksLoaderService {
         }
 
         @SneakyThrows
-        void accept(DomainLinkRecord record) {
+        void accept(SlopDomainLinkRecord record) {
             domainLinkDbWriter.write(
-                    domainIdRegistry.getDomainId(record.source),
-                    domainIdRegistry.getDomainId(record.dest)
+                    domainIdRegistry.getDomainId(record.source()),
+                    domainIdRegistry.getDomainId(record.dest())
             );
         }
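
SlopPageRef itself is not defined in this hunk, but its use as a (baseDir, page) pair is visible in the log call above. A plausible shape, offered purely as an assumption to make the surrounding code easier to read:

    import java.nio.file.Path;

    // Assumed shape of SlopPageRef: a typed pointer to one page of a slop
    // table on disk. The type parameter only records which record type the
    // page holds; it has no runtime representation.
    record SlopPageRef<T>(Path baseDir, int page) {
        @Override
        public String toString() {
            return baseDir + ":" + page; // matches the "{}:{}" log format above
        }
    }
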
diff --git a/code/processes/loading-process/test/nu/marginalia/loading/domains/DomainLoaderServiceTest.java b/code/processes/loading-process/test/nu/marginalia/loading/domains/DomainLoaderServiceTest.java
deleted file mode 100644
index fda0e9b6..00000000
--- a/code/processes/loading-process/test/nu/marginalia/loading/domains/DomainLoaderServiceTest.java
+++ /dev/null
@@ -1,102 +0,0 @@
-package nu.marginalia.loading.domains;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-import nu.marginalia.ProcessConfiguration;
-import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter;
-import nu.marginalia.io.processed.DomainRecordParquetFileWriter;
-import nu.marginalia.io.processed.ProcessedDataFileNames;
-import nu.marginalia.loading.LoaderInputData;
-import nu.marginalia.model.processed.DomainLinkRecord;
-import nu.marginalia.model.processed.DomainRecord;
-import nu.marginalia.process.control.ProcessAdHocTaskHeartbeat;
-import nu.marginalia.process.control.ProcessHeartbeat;
-import org.junit.jupiter.api.*;
-import org.mockito.Mockito;
-import org.testcontainers.junit.jupiter.Testcontainers;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.*;
-import java.util.stream.Collectors;
-
-import static org.junit.jupiter.api.Assertions.*;
-
-@Tag("slow")
-@Testcontainers
-class DomainLoaderServiceTest {
-    List<Path> toDelete = new ArrayList<>();
-    ProcessHeartbeat heartbeat;
-
-    @BeforeEach
-    public void setUp() {
-        heartbeat = Mockito.mock(ProcessHeartbeat.class);
-
-        Mockito.when(heartbeat.createAdHocTaskHeartbeat(Mockito.anyString())).thenReturn(
-                Mockito.mock(ProcessAdHocTaskHeartbeat.class)
-        );
-    }
-
-    @AfterEach
-    public void tearDown() throws IOException {
-        for (var path : Lists.reverse(toDelete)) {
-            Files.deleteIfExists(path);
-        }
-
-        toDelete.clear();
-    }
-
-    @Test
-    void readDomainNames() throws IOException {
-        Path workDir = Files.createTempDirectory(getClass().getSimpleName());
-        Path parquetFile1 = ProcessedDataFileNames.domainFileName(workDir, 0);
-        Path parquetFile2 = ProcessedDataFileNames.domainFileName(workDir, 1);
-        Path parquetFile3 = ProcessedDataFileNames.domainLinkFileName(workDir, 0);
-
-        toDelete.add(workDir);
-        toDelete.add(parquetFile1);
-        toDelete.add(parquetFile2);
-        toDelete.add(parquetFile3);
-
-        // Prep by creating two parquet files with domains
-        // and one with domain links
-
-        List<String> domains1 = List.of("www.marginalia.nu", "memex.marginalia.nu", "search.marginalia.nu");
-        List<String> domains2 = List.of("wiby.me", "www.mojeek.com", "www.altavista.com");
-        List<String> linkDomains = List.of("maya.land", "xkcd.com", "aaronsw.com");
-
-        try (var pw = new DomainRecordParquetFileWriter(parquetFile1)) {
-            for (var domain : domains1) {
-                pw.write(dr(domain));
-            }
-        }
-        try (var pw = new DomainRecordParquetFileWriter(parquetFile2)) {
-            for (var domain : domains2) {
-                pw.write(dr(domain));
-            }
-        }
-        try (var pw = new DomainLinkRecordParquetFileWriter(parquetFile3)) {
-            for (var domain : linkDomains) {
-                pw.write(dl(domain));
-            }
-        }
-        // Read them
-        var domainService = new DomainLoaderService(null, new ProcessConfiguration("test", 1, UUID.randomUUID()));
-
-        // Verify
-        Set<String> expectedDomains1 = Sets.union(new HashSet<>(domains1), new HashSet<>(domains2));
-        assertEquals(expectedDomains1, domainService.readBasicDomainInformation(new LoaderInputData(workDir, 2)).stream().map(d -> d.domain).collect(Collectors.toSet()));
-
-        Set<String> expectedDomains2 = new HashSet<>(linkDomains);
-        assertEquals(expectedDomains2, domainService.readReferencedDomainNames(new LoaderInputData(workDir, 2)));
-    }
-
-    private DomainRecord dr(String domainName) {
-        return new DomainRecord(domainName, 0, 0, 0, null, null, null, null);
-    }
-
-    private DomainLinkRecord dl(String destDomainName) {
-        return new DomainLinkRecord("www.marginalia.nu", destDomainName);
-    }
-}
\ No newline at end of file
diff --git a/code/process-mqapi/build.gradle b/code/processes/process-mq-api/build.gradle
similarity index 91%
rename from code/process-mqapi/build.gradle
rename to code/processes/process-mq-api/build.gradle
index 339c52c8..b6881432 100644
--- a/code/process-mqapi/build.gradle
+++ b/code/processes/process-mq-api/build.gradle
@@ -11,6 +11,8 @@ java {
     }
 }
 
+jar.archiveBaseName = 'process-mqapi'
+
 apply from: "$rootProject.projectDir/srcsets.gradle"
 
 dependencies {
diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/ProcessInboxNames.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/ProcessInboxNames.java
similarity index 100%
rename from code/process-mqapi/java/nu/marginalia/mqapi/ProcessInboxNames.java
rename to code/processes/process-mq-api/java/nu/marginalia/mqapi/ProcessInboxNames.java
diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/converting/ConvertAction.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/converting/ConvertAction.java
similarity index 100%
rename from code/process-mqapi/java/nu/marginalia/mqapi/converting/ConvertAction.java
rename to code/processes/process-mq-api/java/nu/marginalia/mqapi/converting/ConvertAction.java
diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/converting/ConvertRequest.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/converting/ConvertRequest.java
similarity index 100%
rename from code/process-mqapi/java/nu/marginalia/mqapi/converting/ConvertRequest.java
rename to code/processes/process-mq-api/java/nu/marginalia/mqapi/converting/ConvertRequest.java
diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/crawling/CrawlRequest.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/crawling/CrawlRequest.java
similarity index 100%
rename from code/process-mqapi/java/nu/marginalia/mqapi/crawling/CrawlRequest.java
rename to code/processes/process-mq-api/java/nu/marginalia/mqapi/crawling/CrawlRequest.java
diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/index/CreateIndexRequest.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/index/CreateIndexRequest.java
similarity index 100%
rename from code/process-mqapi/java/nu/marginalia/mqapi/index/CreateIndexRequest.java
rename to code/processes/process-mq-api/java/nu/marginalia/mqapi/index/CreateIndexRequest.java
diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/index/IndexName.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/index/IndexName.java
similarity index 100%
rename from code/process-mqapi/java/nu/marginalia/mqapi/index/IndexName.java
rename to code/processes/process-mq-api/java/nu/marginalia/mqapi/index/IndexName.java
diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/loading/LoadRequest.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/loading/LoadRequest.java
similarity index 100%
rename from code/process-mqapi/java/nu/marginalia/mqapi/loading/LoadRequest.java
rename to code/processes/process-mq-api/java/nu/marginalia/mqapi/loading/LoadRequest.java
diff --git a/code/services-application/api-service/java/nu/marginalia/api/ApiSearchOperator.java b/code/services-application/api-service/java/nu/marginalia/api/ApiSearchOperator.java
index e979b86f..95145de3 100644
--- a/code/services-application/api-service/java/nu/marginalia/api/ApiSearchOperator.java
+++ b/code/services-application/api-service/java/nu/marginalia/api/ApiSearchOperator.java
@@ -8,9 +8,9 @@ import nu.marginalia.api.model.ApiSearchResults;
 import nu.marginalia.api.searchquery.QueryClient;
 import nu.marginalia.api.searchquery.model.query.QueryParams;
 import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
-import nu.marginalia.api.searchquery.model.results.*;
+import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
 import nu.marginalia.index.query.limit.QueryLimits;
-import nu.marginalia.model.idx.WordMetadata;
+import nu.marginalia.model.idx.WordFlags;
 
 import java.util.ArrayList;
 import java.util.Comparator;
@@ -77,14 +77,8 @@ public class ApiSearchOperator {
         if (url.rawIndexResult != null) {
             List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
             for (var entry : url.rawIndexResult.keywordScores) {
-                var metadata = new WordMetadata(entry.encodedWordMetadata());
-
-                // Skip terms that don't appear anywhere
-                if (metadata.isEmpty())
-                    continue;
-
-                Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
-                lst.add(new ApiSearchResultQueryDetails(entry.keyword, Long.bitCount(metadata.positions()), flags));
+                Set<String> flags = WordFlags.decode(entry.flags).stream().map(Object::toString).collect(Collectors.toSet());
+                lst.add(new ApiSearchResultQueryDetails(entry.keyword, entry.positionCount, flags));
             }
 
             details.add(lst);
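
With WordMetadata retired, term flags now travel as a plain bitmask that WordFlags.decode expands back into a set. A toy version of that decode pattern (the enum constants are an illustrative subset, not the real WordFlags in nu.marginalia.model.idx, and the bit assignment by ordinal is an assumption):

    import java.util.EnumSet;

    public enum WordFlag {
        Title, ExternalLink, Synthetic; // illustrative subset of the real flags

        long asBit() { return 1L << ordinal(); }

        // Expand a packed bitmask into the set of flags it encodes.
        static EnumSet<WordFlag> decode(long encoded) {
            EnumSet<WordFlag> flags = EnumSet.noneOf(WordFlag.class);
            for (var flag : values()) {
                if ((encoded & flag.asBit()) != 0)
                    flags.add(flag);
            }
            return flags;
        }

        public static void main(String[] args) {
            long mask = Title.asBit() | ExternalLink.asBit();
            System.out.println(decode(mask)); // [Title, ExternalLink]
        }
    }
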
diff --git a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java
index faba9eb7..be1f4c2a 100644
--- a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java
+++ b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java
@@ -38,9 +38,6 @@ public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
         for (var keywordScore : urlDetail.resultItem.keywordScores) {
             if (keywordScore.isKeywordSpecial())
                 continue;
 
-            if (keywordScore.positions() == 0)
-                continue;
-
             if (keywordScore.hasTermFlag(WordFlags.Title))
                 return false;
 
             if (keywordScore.hasTermFlag(WordFlags.ExternalLink))
diff --git a/code/services-core/control-service/build.gradle b/code/services-core/control-service/build.gradle
index 2830bd5f..f326801d 100644
--- a/code/services-core/control-service/build.gradle
+++ b/code/services-core/control-service/build.gradle
@@ -35,12 +35,12 @@ dependencies {
     implementation project(':code:functions:search-query:api')
     implementation project(':code:execution:api')
     implementation project(':code:index:api')
-    implementation project(':code:process-mqapi')
+    implementation project(':code:processes:process-mq-api')
     implementation project(':code:features-search:screenshots')
 
     implementation project(':code:index:index-journal')
     implementation project(':code:index:query')
-    implementation project(':code:process-models:crawl-spec')
+    implementation project(':code:processes:crawling-process:model')
 
     implementation libs.bundles.slf4j
 
diff --git a/code/services-core/executor-service/build.gradle b/code/services-core/executor-service/build.gradle
index 7693083b..74696bf3 100644
--- a/code/services-core/executor-service/build.gradle
+++ b/code/services-core/executor-service/build.gradle
@@ -45,15 +45,15 @@ dependencies {
 
     implementation project(':code:functions:link-graph:api')
 
-    implementation project(':code:process-models:crawl-spec')
-    implementation project(':code:process-models:crawling-model')
+    implementation project(':code:processes:crawling-process:model')
+    implementation project(':code:processes:crawling-process:model')
     implementation project(':code:features-crawl:link-parser')
     implementation project(':code:features-convert:data-extractors')
     implementation project(':code:features-convert:stackexchange-xml')
     implementation project(':code:features-convert:reddit-json')
     implementation project(':code:index:index-journal')
     implementation project(':code:index:api')
-    implementation project(':code:process-mqapi')
+    implementation project(':code:processes:process-mq-api')
     implementation project(':code:execution')
     implementation project(':code:execution:api')
 
diff --git a/code/tools/experiment-runner/build.gradle b/code/tools/experiment-runner/build.gradle
index 17be5cb4..2aea9f76 100644
--- a/code/tools/experiment-runner/build.gradle
+++ b/code/tools/experiment-runner/build.gradle
@@ -32,7 +32,7 @@ dependencies {
     implementation project(':code:libraries:language-processing')
     implementation project(':code:libraries:term-frequency-dict')
     implementation project(':code:processes:converting-process')
-    implementation project(':code:process-models:crawling-model')
+    implementation project(':code:processes:crawling-process:model')
     implementation project(':third-party:commons-codec')
 
     implementation project(':code:features-crawl:link-parser')
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/Experiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/Experiment.java
index b5f9ff40..1797c1d6 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/Experiment.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/Experiment.java
@@ -1,6 +1,6 @@
 package nu.marginalia.tools;
 
-import nu.marginalia.crawling.io.SerializableCrawlDataStream;
+import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
 
 import java.io.IOException;
 import java.util.HashSet;
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java
index 668a25a9..a7879747 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java
@@ -3,14 +3,15 @@ package nu.marginalia.tools;
 
 import com.google.inject.Guice;
 import com.google.inject.Injector;
 import nu.marginalia.converting.ConverterModule;
-import nu.marginalia.crawling.io.CrawledDomainReader;
+import nu.marginalia.io.crawldata.CrawledDomainReader;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.service.module.DatabaseModule;
 import nu.marginalia.tools.experiments.*;
 
 import java.io.IOException;
 import java.nio.file.Path;
-import java.util.*;
+import java.util.Arrays;
+import java.util.Map;
 
 public class ExperimentRunnerMain {
 
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/LegacyExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/LegacyExperiment.java
index 5d7d8d11..effb216f 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/LegacyExperiment.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/LegacyExperiment.java
@@ -1,8 +1,8 @@
 package nu.marginalia.tools;
 
-import nu.marginalia.crawling.io.SerializableCrawlDataStream;
-import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.crawldata.CrawledDomain;
 
 import java.io.IOException;
 import java.util.ArrayList;
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AdblockExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AdblockExperiment.java
index 70856439..60cb6938 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AdblockExperiment.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AdblockExperiment.java
@@ -3,8 +3,8 @@ package nu.marginalia.tools.experiments;
 import com.google.inject.Inject;
 import nu.marginalia.adblock.AdblockSimulator;
 import nu.marginalia.converting.processor.DocumentProcessor;
-import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.tools.LegacyExperiment;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AtagsExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AtagsExperiment.java
index d08ec90f..4f63f564 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AtagsExperiment.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AtagsExperiment.java
@@ -7,9 +7,9 @@ import nu.marginalia.ProcessConfiguration;
 import nu.marginalia.atags.AnchorTextKeywords;
 import nu.marginalia.atags.source.AnchorTagsSource;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
-import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.tools.LegacyExperiment;
 
 import java.sql.SQLException;
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java
index 8290a658..4a34a31c 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java
@@ -3,7 +3,7 @@ package nu.marginalia.tools.experiments;
 import com.google.inject.Inject;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.converting.processor.plugin.specialization.BlogSpecialization;
-import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.tools.LegacyExperiment;
 import org.jsoup.Jsoup;
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java
index f602a837..1d49536f 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java
@@ -3,11 +3,11 @@ package nu.marginalia.tools.experiments;
 import com.google.inject.Inject;
 import gnu.trove.set.hash.TLongHashSet;
 import lombok.SneakyThrows;
-import nu.marginalia.crawling.io.SerializableCrawlDataStream;
-import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.tools.Experiment;
 import org.jsoup.Jsoup;
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java
index f83196e5..579aaa2e 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java
@@ -4,11 +4,10 @@ import com.google.inject.Inject;
 import lombok.SneakyThrows;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
-import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.segmentation.NgramLexicon;
+import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.tools.LegacyExperiment;
 import org.jsoup.Jsoup;
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java
index 0afb290f..d69b1bda 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java
@@ -3,7 +3,7 @@ package nu.marginalia.tools.experiments;
 import com.google.inject.Inject;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.DomainProcessor;
-import nu.marginalia.crawling.io.SerializableCrawlDataStream;
+import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.tools.Experiment;
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TestExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TestExperiment.java
index 521b36e8..436b227d 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TestExperiment.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TestExperiment.java
@@ -1,6 +1,6 @@
 package nu.marginalia.tools.experiments;
 
-import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.tools.LegacyExperiment;
 
 public class TestExperiment extends LegacyExperiment {
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java
index 0d99356a..ad2be0bb 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java
@@ -4,8 +4,8 @@ import com.google.inject.Inject;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.adblock.GoogleAnwersSpamDetector;
 import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
-import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.tools.LegacyExperiment;
 import nu.marginalia.topic.RecipeDetector;
 import nu.marginalia.topic.TextileCraftDetector;
diff --git a/code/tools/integration-test/build.gradle b/code/tools/integration-test/build.gradle
index f4623a45..81e3cde9 100644
--- a/code/tools/integration-test/build.gradle
+++ b/code/tools/integration-test/build.gradle
@@ -17,10 +17,12 @@ dependencies {
     implementation project(':code:processes:crawling-process')
     implementation project(':code:processes:converting-process')
     implementation project(':code:processes:loading-process')
-    implementation project(':code:process-models:crawling-model')
-    implementation project(':code:process-models:processed-data')
+    implementation project(':code:processes:crawling-process:model')
+    implementation project(':code:processes:converting-process:model')
     implementation project(':code:processes:index-constructor-process')
     implementation project(':code:index')
+    implementation project(':code:libraries:array')
+    implementation project(':code:libraries:btree')
     implementation project(':code:functions:search-query:api')
     implementation project(':code:index:index-reverse')
     implementation project(':code:index:index-forward')
@@ -43,6 +45,8 @@ dependencies {
     implementation libs.guice
     implementation libs.fastutil
     implementation libs.trove
+    testImplementation libs.bundles.junit
+    testImplementation project(':code:libraries:test-helpers')
 }
diff --git a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java
index 7f75409d..7ec8841b 100644
--- a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java
+++ b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java
@@ -11,8 +11,6 @@ import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
-import nu.marginalia.crawling.io.CrawledDomainReader;
-import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.functions.searchquery.QueryFactory;
 import nu.marginalia.index.IndexGrpcService;
 import nu.marginalia.index.ReverseIndexFullFileNames;
@@ -23,9 +21,10 @@ import nu.marginalia.index.domainrankings.DomainRankings;
 import nu.marginalia.index.forward.ForwardIndexConverter;
 import nu.marginalia.index.forward.ForwardIndexFileNames;
 import nu.marginalia.index.index.StatefulIndex;
-import nu.marginalia.index.journal.reader.IndexJournalReader;
+import nu.marginalia.index.journal.IndexJournal;
 import nu.marginalia.index.model.SearchParameters;
 import nu.marginalia.index.searchset.SearchSetAny;
+import nu.marginalia.io.crawldata.CrawledDomainReader;
 import nu.marginalia.linkdb.docs.DocumentDbReader;
 import nu.marginalia.linkdb.docs.DocumentDbWriter;
 import nu.marginalia.loading.LoaderIndexJournalWriter;
@@ -37,9 +36,9 @@ import nu.marginalia.loading.links.DomainLinksLoaderService;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.storage.FileStorageService;
-import nu.marginalia.storage.model.FileStorageBaseType;
 import nu.marginalia.test.IntegrationTestModule;
 import nu.marginalia.test.TestUtil;
 import org.junit.jupiter.api.AfterEach;
@@ -53,9 +52,7 @@ import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 import java.util.List;
 
-import static nu.marginalia.index.journal.reader.IndexJournalReader.FILE_HEADER_SIZE_BYTES;
 import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
-import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.Mockito.when;
@@ -179,14 +176,6 @@ public class IntegrationTest {
 
         documentDbWriter.close();
         keywordLoaderService.close();
 
-        Path journalFile = fileStorageService
-                .getStorageBase(FileStorageBaseType.CURRENT)
-                .asPath()
-                .resolve("iw/page-index-0000.dat");
-
-        assertTrue(Files.exists(journalFile), "Journal file not found: " + journalFile);
-        assertTrue(Files.size(journalFile) > FILE_HEADER_SIZE_BYTES, "Journal file does not contain data");
-
         /** CONSTRUCT INDEX */
 
         createForwardIndex();
@@ -248,7 +237,6 @@ public class IntegrationTest {
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,
-                IndexJournalReader::singleFile,
                 this::addRankToIdEncoding,
                 tmpDir);
 
@@ -267,7 +255,6 @@ public class IntegrationTest {
         var constructor = new PrioIndexConstructor(
                 outputFileDocs,
                 outputFileWords,
-                (path) -> IndexJournalReader.singleFile(path).filtering(r -> r != 0),
                 this::addRankToIdEncoding,
                 tmpDir);
 
@@ -278,12 +265,14 @@ public class IntegrationTest {
         Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
 
         Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
+        Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT);
         Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
 
         ForwardIndexConverter converter = new ForwardIndexConverter(new FakeProcessHeartbeat(),
-                IndexJournalReader.paging(workDir),
                 outputFileDocsId,
                 outputFileDocsData,
+                outputFileSpansData,
+                IndexJournal.findJournal(workDir).orElseThrow(),
                 domainRankings
         );
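
The forward index now emits a third artifact, the spans data file, resolved through the same CURRENT/NEXT file-version scheme as the doc-id and doc-data files. A toy sketch of how such a two-version naming scheme typically works (the identifier and version names mirror the diff, but the actual file stems are assumptions):

    import java.nio.file.Path;

    public class ForwardIndexNamesSketch {
        enum FileIdentifier { DOC_ID, DOC_DATA, SPANS_DATA }
        enum FileVersion { CURRENT, NEXT }

        // e.g. fwd-spans-data.dat vs fwd-spans-data.dat.next: the NEXT file
        // is built off to the side and swapped in once construction succeeds.
        static Path resolve(Path basePath, FileIdentifier id, FileVersion version) {
            String stem = "fwd-" + id.name().toLowerCase().replace('_', '-') + ".dat";
            return switch (version) {
                case CURRENT -> basePath.resolve(stem);
                case NEXT -> basePath.resolve(stem + ".next");
            };
        }

        public static void main(String[] args) {
            System.out.println(resolve(Path.of("/index"), FileIdentifier.SPANS_DATA, FileVersion.NEXT));
        }
    }
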
diff --git a/code/tools/integration-test/test/nu/marginalia/test/IntegrationTestModule.java b/code/tools/integration-test/test/nu/marginalia/test/IntegrationTestModule.java
index 69b94ee8..83f79fbf 100644
--- a/code/tools/integration-test/test/nu/marginalia/test/IntegrationTestModule.java
+++ b/code/tools/integration-test/test/nu/marginalia/test/IntegrationTestModule.java
@@ -12,8 +12,7 @@ import nu.marginalia.ProcessConfiguration;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.db.DomainTypes;
 import nu.marginalia.index.domainrankings.DomainRankings;
-import nu.marginalia.index.journal.writer.IndexJournalWriter;
-import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
+import nu.marginalia.index.journal.IndexJournalSlopWriter;
 import nu.marginalia.index.searchset.SearchSetAny;
 import nu.marginalia.index.searchset.SearchSetsService;
 import nu.marginalia.linkdb.docs.DocumentDbReader;
@@ -100,8 +99,9 @@ public class IntegrationTestModule extends AbstractModule {
 
                 bind(ServiceEventLog.class).toInstance(Mockito.mock(ServiceEventLog.class));
 
-                bind(IndexJournalWriter.class).toInstance(new IndexJournalWriterPagingImpl(
-                        IndexLocations.getIndexConstructionArea(fileStorageServiceMock)
+                bind(IndexJournalSlopWriter.class).toInstance(new IndexJournalSlopWriter(
+                        IndexLocations.getIndexConstructionArea(fileStorageServiceMock),
+                        0
                 ));
 
                 bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(
diff --git a/settings.gradle b/settings.gradle
index b62fba21..78ec0028 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -70,7 +70,7 @@ include 'code:features-crawl:crawl-blocklist'
 include 'code:features-crawl:link-parser'
 include 'code:features-crawl:content-type'
 
-include 'code:process-mqapi'
+include 'code:processes:process-mq-api'
 
 include 'code:common:db'
 include 'code:common:linkdb'
@@ -81,17 +81,16 @@ include 'code:common:renderer'
 include 'code:common:process'
 
 include 'code:processes:converting-process'
+include 'code:processes:converting-process:model'
+
 include 'code:processes:crawling-process'
+include 'code:processes:crawling-process:model'
+
 include 'code:processes:loading-process'
 include 'code:processes:index-constructor-process'
 include 'code:processes:test-data'
 include 'code:processes:website-adjacencies-calculator'
 
-include 'code:process-models:crawling-model'
-include 'code:process-models:work-log'
-include 'code:process-models:crawl-spec'
-include 'code:process-models:processed-data'
-
 include 'code:tools:experiment-runner'
 include 'code:tools:screenshot-capture-tool'
 include 'code:tools:load-test'