From 082c9cc308c6597373f3f7979a1acada2e0c1b39 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 16 Jun 2022 14:01:49 +0200 Subject: [PATCH 01/40] Fixing typo on front page. (cherry picked from commit 5ef953ae3dd797179d582db0c32d68f9bdef8fe3) --- marginalia_nu/src/main/resources/static/edge/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marginalia_nu/src/main/resources/static/edge/index.html b/marginalia_nu/src/main/resources/static/edge/index.html index 166e67b8..47d6e314 100644 --- a/marginalia_nu/src/main/resources/static/edge/index.html +++ b/marginalia_nu/src/main/resources/static/edge/index.html @@ -88,7 +88,7 @@ theology, the occult, knitting, - compter science, + computer science, or art.

From 2e55599850b692de2844c5cdd9f80fc876df9e28 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 16 Jun 2022 14:09:57 +0200 Subject: [PATCH 02/40] Revert "Revert "Merge branch 'experimental' into master"" This reverts commit 81c77e7fcb2c5d31f13841462910003d745783e9. --- marginalia_nu/build.gradle | 31 +- .../wmsa/edge/EdgeSearchE2ETest.java | 23 +- marginalia_nu/src/e2e/resources/init.sh | 2 +- .../nu/marginalia/util/btree/BTreeWriter.java | 16 +- .../marginalia/util/btree/WriteCallback.java | 4 +- .../util/btree/model/BTreeHeader.java | 3 +- .../marginalia/util/hash/LongPairHashMap.java | 47 +- .../util/multimap/MultimapFileLong.java | 11 +- .../multimap/MultimapFileLongOffsetSlice.java | 70 +++ .../util/multimap/MultimapFileLongSlice.java | 29 + .../util/multimap/MultimapSearcher.java | 4 +- .../util/multimap/MultimapSorter.java | 4 +- .../marginalia/util/ranking/AcademiaRank.java | 49 -- .../util/ranking/BetterReversePageRank.java | 8 +- .../util/ranking/BetterStandardPageRank.java | 8 +- .../util/ranking/BuggyReversePageRank.java | 8 +- .../util/ranking/BuggyStandardPageRank.java | 8 +- .../util/ranking/RankingAlgorithm.java | 277 +++------ .../util/ranking/RankingDomainData.java | 33 + .../util/ranking/RankingDomainFetcher.java | 105 ++++ .../ranking/old/OldReversePageRankV2.java | 4 +- .../util/ranking/old/StandardPageRank.java | 4 +- .../util/ranking/tool/DedupTool.java | 2 +- .../util/ranking/tool/PerusePageRankV2.java | 4 +- .../ranking/tool/TestAcademiaRankTool.java | 30 - .../ranking/tool/UpdateDomainRanksTool.java | 14 +- .../ranking/tool/UpdateDomainRanksTool2.java | 12 +- .../edge/converting/ReindexTriggerMain.java | 4 +- .../converting/interpreter/Interpreter.java | 2 +- .../instruction/LoadProcessedDomain.java | 4 +- .../wmsa/edge/converting/loader/Loader.java | 6 +- .../converting/loader/SqlLoadDomainLinks.java | 6 +- .../converting/loader/SqlLoadDomains.java | 27 +- .../loader/SqlLoadProcessedDocument.java | 24 +- .../loader/SqlLoadProcessedDomain.java | 27 +- .../edge/converting/loader/SqlLoadUrls.java | 20 +- .../processor/InstructionsCompiler.java | 2 +- .../edge/crawling/CrawlJobExtractorMain.java | 11 +- .../CrawlJobExtractorPageRankMain.java | 15 +- .../wmsa/edge/data/dao/EdgeDataStoreDao.java | 32 +- .../edge/data/dao/EdgeDataStoreDaoImpl.java | 588 ++---------------- .../dao/task/EdgeDomainBlacklistImpl.java | 2 +- .../index/{radix => }/EdgeIndexBucket.java | 10 +- .../wmsa/edge/index/EdgeIndexControl.java | 9 +- .../wmsa/edge/index/EdgeIndexService.java | 6 +- .../wmsa/edge/index/IndexServicesFactory.java | 16 +- .../ConversionUnnecessaryException.java | 2 +- .../SearchEngineRanking.java | 2 +- .../SearchIndexConverter.java | 108 ++-- .../SearchIndexDao.java | 43 +- .../SearchIndexPartitioner.java | 4 +- .../SearchIndexPreconverter.java | 3 +- .../words/WordIndexLengthsTable.java | 10 + .../words/WordIndexOffsetsTable.java | 67 ++ .../conversion/words/WordIndexTables.java | 56 ++ .../conversion/words/WordsTableWriter.java | 75 +++ .../dictionary/DictionaryReader.java | 2 +- .../dictionary/DictionaryWriter.java | 2 +- .../dictionary/TokenCompressor.java | 2 +- .../index => journal}/SearchIndexWriter.java | 2 +- .../SearchIndexWriterImpl.java | 4 +- .../IndexWordsTable.java} | 90 ++- .../index => reader}/SearchIndex.java | 6 +- .../index => reader}/SearchIndexReader.java | 10 +- .../{service => reader}/SearchIndexes.java | 10 +- .../query/IndexQueryBuilder.java | 4 +- .../query/IndexSearchBudget.java | 2 +- .../{service => reader}/query/Query.java | 2 +- .../wmsa/edge/index/service/SearchOrder.java | 6 - .../index/wordstable/IndexWordsTable.java | 48 -- .../index/wordstable/WordsTableWriter.java | 85 --- .../wmsa/edge/model/EdgeDomain.java | 5 +- .../model/crawl/EdgeDomainIndexingState.java | 31 +- .../model/search/EdgeSearchSpecification.java | 4 +- .../edge/model/search/EdgeUrlDetails.java | 17 +- .../wmsa/edge/search/EdgeSearchOperator.java | 3 +- .../wmsa/edge/search/EdgeSearchProfile.java | 17 +- .../command/commands/SiteSearchCommand.java | 4 +- .../edge/search/model/DomainInformation.java | 1 - .../wmsa/edge/search/query/QueryFactory.java | 1 - .../search/results/SearchResultDecorator.java | 2 +- .../siteinfo/DomainInformationService.java | 226 ++++++- .../wmsa/edge/tools/IndexMergerMain.java | 9 +- .../main/resources/sql/edge-crawler-cache.sql | 176 ++---- .../templates/edge/site-info-gmi.hdb | 1 - .../resources/templates/edge/site-info.hdb | 1 - .../java/nu/marginalia/util/TestUtil.java | 2 +- .../util/btree/BTreeWriterTest.java | 26 +- .../util/hash/LongPairHashMapTest.java | 4 +- .../loader/SqlLoadDomainLinksTest.java | 48 ++ .../converting/loader/SqlLoadDomainsTest.java | 52 ++ .../loader/SqlLoadProcessedDocumentTest.java | 94 +++ .../loader/SqlLoadProcessedDomainTest.java | 54 ++ .../converting/loader/SqlLoadUrlsTest.java | 50 ++ .../index/service/DictionaryWriterTest.java | 8 +- .../index/service/EdgeIndexClientTest.java | 6 +- .../service/SearchIndexConverterTest.java | 89 --- .../index/service/SearchIndexWriterTest.java | 14 +- .../index/service/TokenCompressorTest.java | 2 +- .../edge/search/query/QueryVariantsTest.java | 5 +- 100 files changed, 1564 insertions(+), 1654 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{radix => }/EdgeIndexBucket.java (93%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => conversion}/ConversionUnnecessaryException.java (80%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => conversion}/SearchEngineRanking.java (97%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => conversion}/SearchIndexConverter.java (75%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => conversion}/SearchIndexDao.java (64%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/query => conversion}/SearchIndexPartitioner.java (96%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => conversion}/SearchIndexPreconverter.java (97%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => }/dictionary/DictionaryReader.java (92%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => }/dictionary/DictionaryWriter.java (99%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => }/dictionary/TokenCompressor.java (97%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => journal}/SearchIndexWriter.java (88%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => journal}/SearchIndexWriterImpl.java (96%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index/wordstable/BtreeWordsTable.java => reader/IndexWordsTable.java} (58%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => reader}/SearchIndex.java (93%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => reader}/SearchIndexReader.java (96%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => reader}/SearchIndexes.java (91%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => reader}/query/IndexQueryBuilder.java (97%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => reader}/query/IndexSearchBudget.java (87%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => reader}/query/Query.java (73%) delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java diff --git a/marginalia_nu/build.gradle b/marginalia_nu/build.gradle index b2115fb0..eb553649 100644 --- a/marginalia_nu/build.gradle +++ b/marginalia_nu/build.gradle @@ -59,12 +59,12 @@ dependencies { implementation "com.sparkjava:spark-core:2.9.3" implementation 'com.opencsv:opencsv:5.6' - implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1' + implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2' + implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2' + implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2' + implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2' + implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2' + implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2' implementation 'org.slf4j:slf4j-api:1.7.36' @@ -76,7 +76,6 @@ dependencies { implementation 'com.github.ThatJavaNerd:JRAW:1.1.0' implementation group: 'com.h2database', name: 'h2', version: '2.1.210' - testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.3.1' implementation 'org.jsoup:jsoup:1.14.3' implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2' @@ -86,7 +85,7 @@ dependencies { implementation 'com.zaxxer:HikariCP:5.0.1' - implementation 'org.apache.opennlp:opennlp-tools:1.9.3' + implementation 'org.apache.opennlp:opennlp-tools:1.9.4' implementation 'io.prometheus:simpleclient:0.15.0' implementation 'io.prometheus:simpleclient_servlet:0.15.0' implementation 'io.prometheus:simpleclient_httpserver:0.15.0' @@ -123,15 +122,19 @@ dependencies { testImplementation 'org.projectlombok:lombok:1.18.24' testAnnotationProcessor 'org.projectlombok:lombok:1.18.24' + testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.5.1' + + testImplementation platform('org.testcontainers:testcontainers-bom:1.17.2') + testImplementation 'org.testcontainers:mariadb:1.17.2' + testImplementation "org.testcontainers:junit-jupiter:1.17.2" + e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2' e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine' e2eTestImplementation 'org.projectlombok:lombok:1.18.24' - e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.22' - e2eTestImplementation 'org.testcontainers:mariadb:1.17.1' - e2eTestImplementation 'org.testcontainers:nginx:1.17.1' - e2eTestImplementation 'org.testcontainers:testcontainers:1.17.1' - e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.1" - e2eTestImplementation "org.testcontainers:selenium:1.17.1" + e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24' + e2eTestImplementation 'org.testcontainers:nginx:1.17.2' + e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2" + e2eTestImplementation "org.testcontainers:selenium:1.17.2" e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4' e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4' } diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java index af43e462..08408de2 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java @@ -28,6 +28,7 @@ import java.util.ArrayList; import java.util.List; import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*; +import static org.junit.jupiter.api.Assertions.assertEquals; @Tag("e2e") @Testcontainers @@ -156,6 +157,16 @@ public class EdgeSearchE2ETest extends E2ETestBase { return wikipediaFiles.toString(); } + private List getTitlesFromSearchResults(String html) { + List ret = new ArrayList<>(); + + for (var title : Jsoup.parse(html).select(".card.search-result > h2")) { + ret.add(title.text()); + } + + return ret; + } + @Test public void testFrontPage() throws IOException { var driver = chrome.getWebDriver(); @@ -173,8 +184,9 @@ public class EdgeSearchE2ETest extends E2ETestBase { driver.get("http://proxyNginx/search?query=bird&profile=corpo"); System.out.println(driver.getTitle()); - System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); + var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); + assertEquals(List.of("Bird"), getTitlesFromSearchResults(html)); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query")); } @@ -187,20 +199,23 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-info")); } + @Test public void testSiteSearch() throws IOException { var driver = chrome.getWebDriver(); driver.get("http://proxyNginx/search?query=site:wikipedia.local%20frog"); System.out.println(driver.getTitle()); - System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); + var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); + + assertEquals(List.of("Frog", "Binomial nomenclature", "Amphibian", "Mantis"), getTitlesFromSearchResults(html)); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search")); } + @Test public void testBrowse() throws IOException { var driver = chrome.getWebDriver(); @@ -209,7 +224,6 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse")); } @Test @@ -220,7 +234,6 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("define")); } @Test diff --git a/marginalia_nu/src/e2e/resources/init.sh b/marginalia_nu/src/e2e/resources/init.sh index 5409f787..50dbd406 100644 --- a/marginalia_nu/src/e2e/resources/init.sh +++ b/marginalia_nu/src/e2e/resources/init.sh @@ -69,4 +69,4 @@ memex memex dating dating EOF -WMSA_HOME=${HOME} java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1 \ No newline at end of file +WMSA_HOME=${HOME} java -server -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1 \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java index 28ac4914..b43faca7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java @@ -3,6 +3,7 @@ package nu.marginalia.util.btree; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; +import nu.marginalia.util.multimap.MultimapFileLongSlice; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -12,9 +13,9 @@ import java.io.IOException; public class BTreeWriter { private final Logger logger = LoggerFactory.getLogger(BTreeWriter.class); private final BTreeContext ctx; - private final MultimapFileLong map; + private final MultimapFileLongSlice map; - public BTreeWriter(MultimapFileLong map, BTreeContext ctx) { + public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) { this.map = map; this.ctx = ctx; } @@ -31,13 +32,18 @@ public class BTreeWriter { return size; } - public long write(long offset, int numEntries, WriteCallback writeIndex) + /** Construct a BTree with numEntries entries at offset in the associated map + * + * @return The size of the written data + */ + public long write(long offset, int numEntries, WriteCallback writeIndexCallback) throws IOException { - var header = makeHeader(offset, numEntries); + BTreeHeader header = makeHeader(offset, numEntries); header.write(map, offset); - writeIndex.write(header.dataOffsetLongs()); + + writeIndexCallback.write(map.atOffset(header.dataOffsetLongs())); if (header.layers() < 1) { return ctx.calculateSize(numEntries); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java index 70bd8132..a6225db1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java @@ -1,7 +1,9 @@ package nu.marginalia.util.btree; +import nu.marginalia.util.multimap.MultimapFileLongSlice; + import java.io.IOException; public interface WriteCallback { - void write(long offset) throws IOException; + void write(MultimapFileLongSlice slice) throws IOException; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java index 4951f5b8..8d68b424 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java @@ -1,6 +1,7 @@ package nu.marginalia.util.btree.model; import nu.marginalia.util.multimap.MultimapFileLong; +import nu.marginalia.util.multimap.MultimapFileLongSlice; public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) { public BTreeHeader { @@ -28,7 +29,7 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, lon return padding; } - public void write(MultimapFileLong dest, long offset) { + public void write(MultimapFileLongSlice dest, long offset) { dest.put(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL)); dest.put(offset+1, indexOffsetLongs); dest.put(offset+2, dataOffsetLongs); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java index 6f8912a9..d1e056b9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java @@ -1,9 +1,7 @@ package nu.marginalia.util.hash; -import io.prometheus.client.Gauge; import lombok.EqualsAndHashCode; import lombok.Getter; -import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable; import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.PrimeUtil; import org.slf4j.Logger; @@ -17,9 +15,7 @@ import static java.lang.Math.round; */ public class LongPairHashMap { private static final Logger logger = LoggerFactory.getLogger(LongPairHashMap.class); - private static final Gauge probe_count_metrics - = Gauge.build("wmsa_wordfile_hash_map_probe_count", "Probing Count") - .register(); + private static final long MAGIC_WORD = 0xE00E00E00E0E0E0EL; // it's the data police private final long hashTableSize; private final MultimapFileLong data; @@ -27,26 +23,37 @@ public class LongPairHashMap { private int sz = 0; private static final int HEADER_SIZE = 2; - public LongPairHashMap(MultimapFileLong data, long size) { + private LongPairHashMap(MultimapFileLong data, long hashTableSize, long maxProbeLength) { this.data = data; - // Actually use a prime size for Donald Knuth reasons - hashTableSize = PrimeUtil.nextPrime(size, 1); - maxProbeLength = hashTableSize / 2; + this.hashTableSize = hashTableSize; + this.maxProbeLength = maxProbeLength; + } - logger.debug("Table size = " + hashTableSize); + public static LongPairHashMap createNew(MultimapFileLong data, long size) { + var tableSize = PrimeUtil.nextPrime(size, 1); + var ret = new LongPairHashMap(data, tableSize, tableSize/2); - data.put(0, IndexWordsTable.Strategy.HASH.ordinal()); - data.put(1, hashTableSize); - for (int i = 2; i < hashTableSize; i++) { + data.put(0, MAGIC_WORD); + data.put(1, tableSize); + + for (int i = 2; i < tableSize; i++) { data.put(HEADER_SIZE + 2L*i, 0); } - } - public LongPairHashMap(MultimapFileLong data) { - this.data = data; - hashTableSize = data.get(1); - maxProbeLength = hashTableSize / 10; - logger.debug("Table size = " + hashTableSize); + return ret; + } + + public static LongPairHashMap loadExisting(MultimapFileLong data) { + long key = data.get(0); + + if (key != MAGIC_WORD) { + logger.warn("LongPairHashMap lacks magic word, could this be garbage data?"); + } + + var hashTableSize = data.get(1); + var maxProbeLength = hashTableSize / 10; + + return new LongPairHashMap(data, hashTableSize, maxProbeLength); } public int size() { @@ -91,8 +98,6 @@ public class LongPairHashMap { final var val = getCell(idx); if (!val.isSet()) { - probe_count_metrics.set(j); - return setValue(data, idx); } else if (val.getKey() == data.getKey()) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java index dca8248e..f381a977 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java @@ -21,7 +21,7 @@ import static java.nio.channels.FileChannel.MapMode.READ_WRITE; import static nu.marginalia.util.FileSizeUtil.readableSize; -public class MultimapFileLong implements AutoCloseable { +public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { private final ArrayList buffers = new ArrayList<>(); private final ArrayList mappedByteBuffers = new ArrayList<>(); @@ -196,10 +196,12 @@ public class MultimapFileLong implements AutoCloseable { } } + @Override public long size() { return fileLength; } + @Override public void put(long idx, long val) { if (idx >= mappedSize) grow(idx); @@ -214,6 +216,7 @@ public class MultimapFileLong implements AutoCloseable { } } + @Override public long get(long idx) { if (idx >= mappedSize) grow(idx); @@ -229,10 +232,12 @@ public class MultimapFileLong implements AutoCloseable { } + @Override public void read(long[] vals, long idx) { read(vals, vals.length, idx); } + @Override public void read(long[] vals, int n, long idx) { if (idx+n >= mappedSize) { grow(idx+n); @@ -257,10 +262,12 @@ public class MultimapFileLong implements AutoCloseable { } + @Override public void write(long[] vals, long idx) { write(vals, vals.length, idx); } + @Override public void write(long[] vals, int n, long idx) { if (idx+n >= mappedSize) { grow(idx+n); @@ -285,6 +292,7 @@ public class MultimapFileLong implements AutoCloseable { } + @Override public void write(LongBuffer vals, long idx) { int n = vals.limit() - vals.position(); if (idx+n >= mappedSize) { @@ -310,6 +318,7 @@ public class MultimapFileLong implements AutoCloseable { } + @Override public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException { int length = (int)(sourceEnd - sourceStart); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java new file mode 100644 index 00000000..c2630ddc --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java @@ -0,0 +1,70 @@ +package nu.marginalia.util.multimap; + +import java.io.IOException; +import java.nio.LongBuffer; +import java.nio.channels.FileChannel; + +public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice { + private final long off; + private final MultimapFileLongSlice map; + + public MultimapFileLongOffsetSlice(MultimapFileLongSlice map, long off) { + this.off = off; + this.map = map; + } + + @Override + public long size() { + return map.size() - off; + } + + @Override + public void put(long idx, long val) { + map.put(off+idx, val); + } + + @Override + public long get(long idx) { + return map.get(off+idx); + } + + @Override + public void read(long[] vals, long idx) { + map.read(vals, idx+off); + } + + @Override + public void read(long[] vals, int n, long idx) { + map.read(vals, n, idx+off); + } + + @Override + public void write(long[] vals, long idx) { + map.write(vals, idx+off); + } + + @Override + public void write(long[] vals, int n, long idx) { + map.write(vals, n, idx+off); + } + + @Override + public void write(LongBuffer vals, long idx) { + map.write(vals, idx+off); + } + + @Override + public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) + throws IOException { + map.transferFromFileChannel(sourceChannel, destOffset + off, sourceStart, sourceEnd); + } + + @Override + public MultimapFileLongSlice atOffset(long off) { + // If we don't override this, the default implementation would build a pyramid of + // MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(...))) + // if this is called iteratively (e.g. to walk over a file) + + return new MultimapFileLongOffsetSlice(map, this.off + off); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java new file mode 100644 index 00000000..abf29f51 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java @@ -0,0 +1,29 @@ +package nu.marginalia.util.multimap; + +import java.io.IOException; +import java.nio.LongBuffer; +import java.nio.channels.FileChannel; + +public interface MultimapFileLongSlice { + long size(); + + void put(long idx, long val); + + long get(long idx); + + void read(long[] vals, long idx); + + void read(long[] vals, int n, long idx); + + void write(long[] vals, long idx); + + void write(long[] vals, int n, long idx); + + void write(LongBuffer vals, long idx); + + void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException; + + default MultimapFileLongSlice atOffset(long off) { + return new MultimapFileLongOffsetSlice(this, off); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java index c961ac0e..005888d8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java @@ -4,9 +4,9 @@ import lombok.experimental.Delegate; public class MultimapSearcher { @Delegate - private final MultimapFileLong mmf; + private final MultimapFileLongSlice mmf; - public MultimapSearcher(MultimapFileLong mmf) { + public MultimapSearcher(MultimapFileLongSlice mmf) { this.mmf = mmf; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java index 6ca4f64f..61dd04c4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java @@ -13,10 +13,10 @@ import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE; public class MultimapSorter { private final Path tmpFileDir; private final int internalSortLimit; - private final MultimapFileLong multimapFileLong; + private final MultimapFileLongSlice multimapFileLong; private final long[] buffer; - public MultimapSorter(MultimapFileLong multimapFileLong, Path tmpFileDir, int internalSortLimit) { + public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit) { this.multimapFileLong = multimapFileLong; this.tmpFileDir = tmpFileDir; this.internalSortLimit = internalSortLimit; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java deleted file mode 100644 index 272a1798..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java +++ /dev/null @@ -1,49 +0,0 @@ -package nu.marginalia.util.ranking; - -import com.zaxxer.hikari.HikariDataSource; -import gnu.trove.list.TIntList; -import gnu.trove.list.array.TIntArrayList; -import gnu.trove.map.hash.TIntIntHashMap; -import it.unimi.dsi.fastutil.ints.IntArrays; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.sql.SQLException; - -public class AcademiaRank { - private final TIntArrayList result; - private static final Logger logger = LoggerFactory.getLogger(AcademiaRank.class); - - public AcademiaRank(HikariDataSource ds, String... origins) throws IOException { - - TIntList rankingResults = new BetterStandardPageRank(ds, origins).pageRank(100_000); - TIntIntHashMap idToRanking = new TIntIntHashMap(100_000, 0.5f, -1, 1_000_000_000); - - for (int i = 0; i < rankingResults.size(); i++) { - idToRanking.put(rankingResults.get(i), i); - } - - result = new TIntArrayList(10000); - try (var conn = ds.getConnection(); - var stmt = conn.prepareStatement("select EC_DOMAIN.ID,COUNT(SOURCE_DOMAIN_ID) AS CNT from EC_DOMAIN INNER JOIN DOMAIN_METADATA ON DOMAIN_METADATA.ID=EC_DOMAIN.ID INNER JOIN EC_DOMAIN_LINK ON EC_DOMAIN_LINK.DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE INDEXED>0 AND STATE>=0 AND STATE<2 AND ((VISITED_URLS>1000+1500*RANK AND RANK<1) OR (GOOD_URLS>1000 AND URL_PART LIKE '%edu')) GROUP BY EC_DOMAIN.ID HAVING CNT<1500 ORDER BY RANK ASC")) { - - stmt.setFetchSize(1000); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - result.add(rsp.getInt(1)); - } - } - catch (SQLException ex) { - logger.error("SQL error", ex); - } - - int[] internalArray = result.toArray(); - IntArrays.quickSort(internalArray, (a,b) -> idToRanking.get(a) - idToRanking.get(b)); - result.set(0, internalArray); - } - - public TIntArrayList getResult() { - return result; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java index f2889ad6..7d3b17c4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java @@ -1,15 +1,11 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; - -import java.io.IOException; - public class BetterReversePageRank extends RankingAlgorithm { - public BetterReversePageRank(HikariDataSource dataSource, String... origins) { - super(dataSource, origins); + public BetterReversePageRank(RankingDomainFetcher domains, String... origins) { + super(domains, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java index 5b64fa73..f1f9b0b1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java @@ -1,14 +1,10 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; - -import java.io.IOException; - public class BetterStandardPageRank extends RankingAlgorithm { - public BetterStandardPageRank(HikariDataSource dataSource, String... origins) { - super(dataSource, origins); + public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) { + super(domains, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java index 1e87776c..485ba353 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java @@ -1,15 +1,11 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; - -import java.io.IOException; - public class BuggyReversePageRank extends RankingAlgorithm { - public BuggyReversePageRank(HikariDataSource dataSource, String... origins) { - super(dataSource, origins); + public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) { + super(domains, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java index a3d7b87e..836bcdfe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java @@ -1,14 +1,10 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; - -import java.io.IOException; - public class BuggyStandardPageRank extends RankingAlgorithm { - public BuggyStandardPageRank(HikariDataSource dataSource, String... origins) { - super(dataSource, origins); + public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) { + super(domains, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java index fd76989c..4d255087 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java @@ -1,224 +1,129 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntObjectHashMap; -import gnu.trove.set.hash.TIntHashSet; import it.unimi.dsi.fastutil.ints.IntComparator; -import lombok.AllArgsConstructor; -import lombok.Data; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.sql.SQLException; import java.util.*; import java.util.function.IntToDoubleFunction; import java.util.stream.IntStream; import it.unimi.dsi.fastutil.ints.IntArrays; public abstract class RankingAlgorithm { - final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); - final TIntIntHashMap domainIndexToId = new TIntIntHashMap(); - final TIntIntHashMap domainIdToIndex = new TIntIntHashMap(); + protected final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); + protected final TIntIntHashMap domainIndexToId = new TIntIntHashMap(); + protected final TIntIntHashMap domainIdToIndex = new TIntIntHashMap(); - private final TIntHashSet spamDomains; - private final HikariDataSource dataSource; - - TIntArrayList[] linkDataSrc2Dest; - TIntArrayList[] linkDataDest2Src; + protected TIntArrayList[] linkDataSrc2Dest; + protected TIntArrayList[] linkDataDest2Src; public final Set originDomains = new HashSet<>(); public final Set originDomainIds = new HashSet<>(); private int maxKnownUrls = Integer.MAX_VALUE; - private static final boolean getNames = true; - private final Logger logger = LoggerFactory.getLogger(getClass()); - public static void main(String... args) throws IOException { - var rpr = new BuggyReversePageRank(new DatabaseModule().provideConnection(), "wiki.xxiivv.com"); - var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu"); + private final RankingDomainFetcher domains; - var rankVector = spr.pageRankVector(); - var norm = rankVector.norm(); - rpr.pageRank(i -> rankVector.get(i) / norm, 25).forEach(i -> { - System.out.println(spr.domainNameFromId(i)); - return true; - }); - } + public RankingAlgorithm(RankingDomainFetcher domains, String... origins) { + this.domains = domains; - public String domainNameFromId(int id) { - return domainsById.get(id).name; - } - public boolean isPeripheral(int id) { - return domainsById.get(id).peripheral; - } - - public RankingAlgorithm(HikariDataSource dataSource, String... origins) { - this.dataSource = dataSource; - var blacklist = new EdgeDomainBlacklistImpl(dataSource); - - spamDomains = blacklist.getSpamDomains(); originDomains.addAll(Arrays.asList(origins)); - try (var conn = dataSource.getConnection()) { + domains.getDomains(domainData -> { + int id = domainData.id; - String s; - if (getNames) { - s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + domainsById.put(id, domainData); + + domainIndexToId.put(domainIndexToId.size(), id); + domainIdToIndex.put(id, domainIdToIndex.size()); + }); + + linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()]; + linkDataDest2Src = new TIntArrayList[domainIndexToId.size()]; + + domains.eachDomainLink((src, dst) -> { + if (src == dst) return; + + if (domainsById.contains(src) && domainsById.contains(dst)) { + + int srcIdx = domainIdToIndex.get(src); + int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); + + if (linkDataSrc2Dest[srcIdx] == null) { + linkDataSrc2Dest[srcIdx] = new TIntArrayList(); + } + linkDataSrc2Dest[srcIdx].add(dstIdx); + + if (linkDataDest2Src[dstIdx] == null) { + linkDataDest2Src[dstIdx] = new TIntArrayList(); + } + linkDataDest2Src[dstIdx].add(srcIdx); + } + }); + + for (var namePattern : this.originDomains) { + domains.domainsByPattern(namePattern, i -> { + int ival = domainIdToIndex.get(i); + if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) { + originDomainIds.add(ival); } else { - s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + logger.debug("No value for {}", i); } - try (var stmt = conn.prepareStatement(s)) { - stmt.setFetchSize(10000); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int id = rsp.getInt(1); - if (!spamDomains.contains(id)) { - - domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), false)); - - domainIndexToId.put(domainIndexToId.size(), id); - domainIdToIndex.put(id, domainIdToIndex.size()); - } - } - } - - - linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()]; - linkDataDest2Src = new TIntArrayList[domainIndexToId.size()]; - - try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) { - stmt.setFetchSize(10000); - - var rsp = stmt.executeQuery(); - - while (rsp.next()) { - int src = rsp.getInt(1); - int dst = rsp.getInt(2); - - if (src == dst) continue; - - if (domainsById.contains(src) && domainsById.contains(dst)) { - - int srcIdx = domainIdToIndex.get(src); - int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); - - if (linkDataSrc2Dest[srcIdx] == null) { - linkDataSrc2Dest[srcIdx] = new TIntArrayList(); - } - linkDataSrc2Dest[srcIdx].add(dstIdx); - - if (linkDataDest2Src[dstIdx] == null) { - linkDataDest2Src[dstIdx] = new TIntArrayList(); - } - linkDataDest2Src[dstIdx].add(srcIdx); - } - } - } - - try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART LIKE ?")) { - for (var seed : this.originDomains) { - stmt.setString(1, seed); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int i = rsp.getInt(1); - int ival = domainIdToIndex.get(i); - if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) { - originDomainIds.add(ival); - } - else { - logger.debug("No value for {}", i); - } - } - logger.debug("{} -> {}", seed, originDomainIds.size()); - } - } - - logger.info("Origin Domains: {}", originDomainIds.size()); - - } catch (SQLException throwables) { - logger.error("SQL error", throwables); + }); } + logger.info("Origin Domains: {}", originDomainIds.size()); } - public void addPeripheralNodes(boolean includeErrorStates) { + public void addPeripheralNodes() { int newNodesIdxCutoff = domainIdToIndex.size(); logger.info("Inserting peripheral nodes"); - try (var conn = dataSource.getConnection()) { - String s; - if (getNames) { - s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; + domains.getPeripheralDomains(domainData -> { + int id = domainData.id; + + if (domainsById.put(id, domainData) == null) { // true if id was not already present + domainIndexToId.put(domainIndexToId.size(), id); + domainIdToIndex.put(id, domainIdToIndex.size()); } - else { - s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; - } - try (var stmt = conn.prepareStatement(s)) { - stmt.setFetchSize(10000); - var rsp = stmt.executeQuery(); + }); - while (rsp.next()) { - int id = rsp.getInt(1); + linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size()); + linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size()); - if (!spamDomains.contains(id)) { - domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), true)); + domains.eachDomainLink((src, dst) -> { + if (src == dst) return; - domainIndexToId.put(domainIndexToId.size(), id); - domainIdToIndex.put(id, domainIdToIndex.size()); - } + if (domainsById.contains(src) && domainsById.contains(dst)) { + int srcIdx = domainIdToIndex.get(src); + int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); + + // This looks like a bug, but it improves the results + if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff) + return; + + if (linkDataSrc2Dest[srcIdx] == null) { + linkDataSrc2Dest[srcIdx] = new TIntArrayList(); } + linkDataSrc2Dest[srcIdx].add(dstIdx); - } - - linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size()); - linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size()); - - try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) { - stmt.setFetchSize(10000); - - var rsp = stmt.executeQuery(); - - while (rsp.next()) { - int src = rsp.getInt(1); - int dst = rsp.getInt(2); - - if (src == dst) continue; - - if (domainsById.contains(src) && domainsById.contains(dst)) { - - int srcIdx = domainIdToIndex.get(src); - int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); - - // This looks like a bug, but it improves the results - if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff) - continue; - - if (linkDataSrc2Dest[srcIdx] == null) { - linkDataSrc2Dest[srcIdx] = new TIntArrayList(); - } - linkDataSrc2Dest[srcIdx].add(dstIdx); - - if (linkDataDest2Src[dstIdx] == null) { - linkDataDest2Src[dstIdx] = new TIntArrayList(); - } - linkDataDest2Src[dstIdx].add(srcIdx); - } + if (linkDataDest2Src[dstIdx] == null) { + linkDataDest2Src[dstIdx] = new TIntArrayList(); } + linkDataDest2Src[dstIdx].add(srcIdx); } - } catch (SQLException throwables) { - logger.error("SQL error", throwables); - } + }); logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size()); } @@ -271,14 +176,14 @@ public abstract class RankingAlgorithm { return rank.getRanking(resultCount); } - public TIntList pageRankWithPeripheralNodes(int resultCount, boolean includeErrorStates) { + public TIntList pageRankWithPeripheralNodes(int resultCount) { RankVector rank = new RankVector(1.d / domainsById.size()); int iter_max = 100; for (int i = 0; i < iter_max; i++) { if (i == iter_max-1) { - addPeripheralNodes(includeErrorStates); + addPeripheralNodes(); } RankVector newRank = createNewRankVector(rank); @@ -323,7 +228,7 @@ public abstract class RankingAlgorithm { abstract RankVector createNewRankVector(RankVector rank); - public boolean includeInRanking(DomainData data) { + public boolean includeInRanking(RankingDomainData data) { if (data.isAlias()) return false; if (data.isSpecial()) @@ -445,32 +350,4 @@ public abstract class RankingAlgorithm { } } - @Data - @AllArgsConstructor - static class DomainData { - public final int id; - public final String name; - private int alias; - private int state; - public final int knownUrls; - public boolean peripheral; - - public int resolveAlias() { - if (alias == 0) return id; - return alias; - } - - public boolean isAlias() { - return alias != 0; - } - - public boolean isSpecial() { - return EdgeDomainIndexingState.SPECIAL.code == state; - } - - public boolean isSocialMedia() { - return EdgeDomainIndexingState.SOCIAL_MEDIA.code == state; - } - } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java new file mode 100644 index 00000000..c29ed704 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java @@ -0,0 +1,33 @@ +package nu.marginalia.util.ranking; + +import lombok.AllArgsConstructor; +import lombok.Data; +import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; + +@Data +@AllArgsConstructor +class RankingDomainData { + public final int id; + public final String name; + private int alias; + private EdgeDomainIndexingState state; + public final int knownUrls; + public boolean peripheral; + + public int resolveAlias() { + if (alias == 0) return id; + return alias; + } + + public boolean isAlias() { + return alias != 0; + } + + public boolean isSpecial() { + return EdgeDomainIndexingState.SPECIAL == state; + } + + public boolean isSocialMedia() { + return EdgeDomainIndexingState.SOCIAL_MEDIA == state; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java new file mode 100644 index 00000000..79285a83 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java @@ -0,0 +1,105 @@ +package nu.marginalia.util.ranking; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.function.Consumer; +import java.util.function.IntConsumer; + +public class RankingDomainFetcher { + private final HikariDataSource dataSource; + private final EdgeDomainBlacklistImpl blacklist; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final boolean getNames = false; + + @Inject + public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) { + this.dataSource = dataSource; + this.blacklist = blacklist; + } + + public void getDomains(Consumer consumer) { + String query; + if (getNames) { + query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + } + else { + query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + } + + getDomains(query, consumer); + } + + + public void getPeripheralDomains(Consumer consumer) { + String query; + if (getNames) { + query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; + } + else { + query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; + } + + getDomains(query, consumer); + } + + private void getDomains(String query, Consumer consumer) { + try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) { + stmt.setFetchSize(10000); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + int id = rsp.getInt(1); + if (!blacklist.isBlacklisted(id)) { + consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false)); + } + } + } + catch (SQLException ex) { + logger.error("Failed to fetch domains", ex); + } + } + + public void eachDomainLink(DomainLinkConsumer consumer) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) + { + stmt.setFetchSize(10000); + + var rsp = stmt.executeQuery(); + + while (rsp.next()) { + int src = rsp.getInt(1); + int dst = rsp.getInt(2); + + consumer.accept(src, dst); + } + } + catch (SQLException ex) { + logger.error("Failed to fetch domain links", ex); + } + } + + public void domainsByPattern(String pattern, IntConsumer idConsumer) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) { + stmt.setString(1, pattern); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + idConsumer.accept(rsp.getInt(1)); + } + } + catch (SQLException ex) { + logger.error("Failed to fetch domains by pattern", ex); + } + } + + public interface DomainLinkConsumer { + void accept(int from, int to); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java index 6a214278..02823563 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java @@ -66,7 +66,7 @@ public class OldReversePageRankV2 { originDomains.add("memex.marginalia.nu"); try (var conn = dataSource.getConnection()) { - try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY_RAW>=-10")) { + try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE")) { stmt.setFetchSize(10000); var rsp = stmt.executeQuery(); while (rsp.next()) { @@ -90,7 +90,7 @@ public class OldReversePageRankV2 { } } - try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) { + try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { stmt.setFetchSize(10000); for (var seed : this.originDomains) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java index c42b28dd..74bef70a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java @@ -48,7 +48,7 @@ public class StandardPageRank { originDomains.addAll(Arrays.asList(origins)); try (var conn = dataSource.getConnection()) { - try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,URL_PART FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY>=-10")) { + try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,DOMAIN_NAME FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE AND QUALITY>=-10")) { stmt.setFetchSize(10000); var rsp = stmt.executeQuery(); while (rsp.next()) { @@ -78,7 +78,7 @@ public class StandardPageRank { } } - try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) { + try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { for (var seed : this.originDomains) { stmt.setString(1, seed); var rsp = stmt.executeQuery(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java index a5ea8b06..d6f95f51 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java @@ -50,7 +50,7 @@ public class DedupTool { Map>> domainToHashToUrl = new HashMap<>(); try (var conn = ds.getConnection(); - var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.URL_PART FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL"); + var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL"); var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?") ) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java index 85a691c2..3f3ce6a5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java @@ -112,10 +112,10 @@ public class PerusePageRankV2 { try (var conn = dataSource.getConnection()) { String s; if (getNames) { - s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID"; + s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID"; } else { - s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID"; + s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID"; } try (var stmt = conn.prepareStatement(s)) { stmt.setFetchSize(10000); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java deleted file mode 100644 index 38192b35..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java +++ /dev/null @@ -1,30 +0,0 @@ -package nu.marginalia.util.ranking.tool; - -import lombok.SneakyThrows; -import nu.marginalia.util.ranking.AcademiaRank; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import org.mariadb.jdbc.Driver; - -import java.io.IOException; - -public class TestAcademiaRankTool { - - @SneakyThrows - public static void main(String... args) { - Driver driver = new Driver(); - var conn = new DatabaseModule().provideConnection(); - - var rank = new AcademiaRank(new DatabaseModule().provideConnection(), "www.perseus.tufts.edu", "xroads.virginia.edu"); - var res = rank.getResult(); - - try (var c = conn.getConnection(); var stmt = c.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) { - for (int i = 0; i < Math.min(res.size(), 100); i++) { - stmt.setInt(1, res.getQuick(i)); - var rsp = stmt.executeQuery(); - while (rsp.next()) - System.out.println(rsp.getString(1)); - } - } - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java index 71ec72a6..f80d307f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java @@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.util.ranking.BuggyStandardPageRank; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.sql.SQLException; import java.util.HashSet; import java.util.Set; @@ -43,12 +44,14 @@ public class UpdateDomainRanksTool { var uploader = new Thread(() -> uploadThread(conn), "Uploader"); logger.info("Ranking"); - var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(),"memex.marginalia.nu"); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu"); rankMax = spr.size()*2; uploader.start(); - spr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> { + spr.pageRankWithPeripheralNodes(rankMax).forEach(i -> { try { uploadQueue.put(i); } catch (InterruptedException e) { @@ -83,11 +86,6 @@ public class UpdateDomainRanksTool { } } - logger.info("Recalculating quality"); - try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) { - stmt.executeUpdate(); - } - } catch (SQLException | InterruptedException throwables) { throwables.printStackTrace(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java index 336b35fd..f46fb390 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java @@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.util.ranking.BetterReversePageRank; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.sql.SQLException; import java.util.HashSet; import java.util.Set; @@ -45,7 +46,9 @@ public class UpdateDomainRanksTool2 { logger.info("Ranking"); // "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com", // "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net" - var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); // var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu"); // var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu"); @@ -58,7 +61,7 @@ public class UpdateDomainRanksTool2 { rankMax = rpr.size(); - rpr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> { + rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> { try { uploadQueue.put(i); } catch (InterruptedException e) { @@ -94,9 +97,6 @@ public class UpdateDomainRanksTool2 { } logger.info("Recalculating quality"); - try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) { - stmt.executeUpdate(); - } } catch (SQLException | InterruptedException throwables) { throwables.printStackTrace(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java index 050152bc..55648dfd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java @@ -29,7 +29,7 @@ public class ReindexTriggerMain { .build(); try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) { - var rs = stmt.executeQuery("SELECT ID, URL_PART, STATE, INDEXED FROM EC_DOMAIN LIMIT 100"); + var rs = stmt.executeQuery("SELECT ID, DOMAIN_NAME, STATE, INDEXED FROM EC_DOMAIN LIMIT 100"); while (rs.next()) { System.out.printf("%d %s %s %d\n", rs.getInt(1), @@ -38,7 +38,7 @@ public class ReindexTriggerMain { rs.getInt(4)); } - rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, URL, VISITED, STATE FROM EC_URL LIMIT 100"); + rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, PATH, VISITED, STATE FROM EC_URL LIMIT 100"); while (rs.next()) { System.out.printf("%d %d %s %d %s\n", rs.getInt(1), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java index 8755716c..c0698dde 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java @@ -14,7 +14,7 @@ public interface Interpreter { void loadRssFeed(EdgeUrl[] rssFeed); void loadDomainLink(DomainLink[] links); - void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality); + void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip); void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument); void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java index 065d6211..2b1fd631 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java @@ -6,11 +6,11 @@ import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; -public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) implements Instruction { +public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) implements Instruction { @Override public void apply(Interpreter interpreter) { - interpreter.loadProcessedDomain(domain, state, quality); + interpreter.loadProcessedDomain(domain, state, ip); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java index 140a762a..49a39457 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java @@ -76,9 +76,9 @@ public class Loader implements Interpreter { } @Override - public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) { - logger.debug("loadProcessedDomain({}, {}, {})", domain, state, quality); - sqlLoadProcessedDomain.load(data, domain, state, quality); + public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) { + logger.debug("loadProcessedDomain({}, {}, {})", domain, state, ip); + sqlLoadProcessedDomain.load(data, domain, state, ip); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java index e0978828..6750bd33 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java @@ -30,7 +30,7 @@ public class SqlLoadDomainLinks { INSERT IGNORE INTO EC_DOMAIN_LINK (SOURCE_DOMAIN_ID, DEST_DOMAIN_ID) SELECT SOURCE.ID,DEST.ID FROM EC_DOMAIN SOURCE INNER JOIN EC_DOMAIN DEST - ON SOURCE.URL_PART=FROM_DOMAIN AND DEST.URL_PART=TO_DOMAIN; + ON SOURCE.DOMAIN_NAME=FROM_DOMAIN AND DEST.DOMAIN_NAME=TO_DOMAIN; END """); } @@ -61,8 +61,8 @@ public class SqlLoadDomainLinks { } } } - catch (SQLException sql) { - sql.printStackTrace(); + catch (SQLException ex) { + logger.warn("SQL error inserting domain links", ex); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java index 18cc40bd..76a839c9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java @@ -25,15 +25,9 @@ public class SqlLoadDomains { stmt.execute(""" CREATE PROCEDURE INSERT_DOMAIN ( IN DOMAIN_NAME VARCHAR(255), - IN SUB_DOMAIN VARCHAR(255), IN TOP_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci) BEGIN - INSERT IGNORE INTO EC_TOP_DOMAIN (URL_PART) VALUES (TOP_DOMAIN); - - INSERT IGNORE INTO EC_DOMAIN(URL_PART, URL_SUBDOMAIN, URL_TOP_DOMAIN_ID) - SELECT DOMAIN_NAME,SUB_DOMAIN,ID - FROM EC_TOP_DOMAIN - WHERE EC_TOP_DOMAIN.URL_PART=TOP_DOMAIN; + INSERT IGNORE INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP) VALUES (DOMAIN_NAME, TOP_DOMAIN); END """); } @@ -46,10 +40,9 @@ public class SqlLoadDomains { public void load(LoaderData data, EdgeDomain domain) { try (var connection = dataSource.getConnection()) { - try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) { + try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) { insertCall.setString(1, domain.toString()); - insertCall.setString(2, domain.subDomain); - insertCall.setString(3, domain.domain); + insertCall.setString(2, domain.domain); insertCall.addBatch(); var ret = insertCall.executeUpdate(); @@ -57,12 +50,11 @@ public class SqlLoadDomains { logger.warn("load({}) -- bad row count {}", domain, ret); } - connection.commit(); findIdForTargetDomain(connection, data); } } catch (SQLException ex) { - ex.printStackTrace(); + logger.warn("SQL error inserting domain", ex); } @@ -73,12 +65,11 @@ public class SqlLoadDomains { try (var connection = dataSource.getConnection()) { connection.setAutoCommit(false); - try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) { + try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) { for (var domain : domains) { insertCall.setString(1, domain.toString()); - insertCall.setString(2, domain.subDomain); - insertCall.setString(3, domain.domain); + insertCall.setString(2, domain.domain); insertCall.addBatch(); } var ret = insertCall.executeBatch(); @@ -95,7 +86,7 @@ public class SqlLoadDomains { findIdForTargetDomain(connection, data); } catch (SQLException ex) { - ex.printStackTrace(); + logger.warn("SQL error inserting domains", ex); } } @@ -104,7 +95,7 @@ public class SqlLoadDomains { return; } - try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) + try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { var targetDomain = data.getTargetDomain(); @@ -118,7 +109,7 @@ public class SqlLoadDomains { } } catch (SQLException ex) { - ex.printStackTrace(); + logger.warn("SQL error finding id for domain", ex); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java index b25a657b..e2e25fff 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java @@ -31,14 +31,14 @@ public class SqlLoadProcessedDocument { IN TITLE VARCHAR(255), IN DESCRIPTION VARCHAR(255), IN LENGTH INT, - IN QUALITY_MEASURE DOUBLE, IN FEATURES INT, IN STANDARD VARCHAR(32), + IN QUALITY DOUBLE, IN HASH INT) BEGIN SET FOREIGN_KEY_CHECKS=0; - REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES); - UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=QUALITY_MEASURE, DATA_HASH=HASH WHERE ID=URL_ID; + REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY); + UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; SET FOREIGN_KEY_CHECKS=1; END """); @@ -47,7 +47,8 @@ public class SqlLoadProcessedDocument { IN URL_ID INT, IN STATE VARCHAR(32)) BEGIN - UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=-100, DATA_HASH=NULL WHERE ID=URL_ID; + UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; + DELETE FROM EC_PAGE_DATA WHERE ID=URL_ID; END """); @@ -61,6 +62,7 @@ public class SqlLoadProcessedDocument { public void load(LoaderData data, List documents) { try (var conn = dataSource.getConnection(); var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) { + conn.setAutoCommit(false); for (var doc : documents) { int urlId = data.getUrlId(doc.url()); @@ -74,9 +76,9 @@ public class SqlLoadProcessedDocument { stmt.setString(3, doc.title()); stmt.setString(4, doc.description()); stmt.setInt(5, doc.length()); - stmt.setDouble(6, doc.quality()); - stmt.setInt(7, doc.htmlFeatures()); - stmt.setString(8, doc.standard().name()); + stmt.setInt(6, doc.htmlFeatures()); + stmt.setString(7, doc.standard().name()); + stmt.setDouble(8, doc.quality()); stmt.setInt(9, (int) doc.hash()); stmt.addBatch(); } @@ -89,8 +91,8 @@ public class SqlLoadProcessedDocument { } conn.commit(); - } catch (SQLException e) { - e.printStackTrace(); + } catch (SQLException ex) { + logger.warn("SQL error inserting document", ex); } @@ -117,8 +119,8 @@ public class SqlLoadProcessedDocument { logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]); } } - } catch (SQLException e) { - e.printStackTrace(); + } catch (SQLException ex) { + logger.warn("SQL error inserting failed document", ex); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java index 64607b3a..018d76c9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java @@ -25,12 +25,12 @@ public class SqlLoadProcessedDomain { stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN"); stmt.execute(""" CREATE PROCEDURE INITIALIZE_DOMAIN ( - IN ST INT, + IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'), IN IDX INT, - IN QUAL DOUBLE, - IN DID INT) + IN DID INT, + IN IP VARCHAR(32)) BEGIN - UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), QUALITY=QUAL, QUALITY_RAW=QUAL, QUALITY_ORIGINAL=QUAL WHERE ID=DID; + UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID; DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; END """); @@ -41,7 +41,7 @@ public class SqlLoadProcessedDomain { } } - public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, double quality) { + public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, String ip) { data.setTargetDomain(domain); loadDomains.load(data, domain); @@ -49,18 +49,17 @@ public class SqlLoadProcessedDomain { try (var conn = dataSource.getConnection(); var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)")) { - initCall.setInt(1, state.code); + initCall.setString(1, state.name()); initCall.setInt(2, 1 + data.sizeHint / 100); - initCall.setDouble(3, quality); - initCall.setInt(4, data.getDomainId(domain)); + initCall.setInt(3, data.getDomainId(domain)); + initCall.setString(4, ip); int rc = initCall.executeUpdate(); if (rc < 1) { - logger.warn("load({},{},{}) -- bad rowcount {}", domain, state, quality, rc); + logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc); } - conn.commit(); } catch (SQLException ex) { - ex.printStackTrace(); + logger.warn("SQL error initializing domain", ex); } } @@ -69,9 +68,9 @@ public class SqlLoadProcessedDomain { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" UPDATE EC_DOMAIN TARGET - INNER JOIN EC_DOMAIN ALIAS ON ALIAS.URL_PART=? + INNER JOIN EC_DOMAIN ALIAS ON ALIAS.DOMAIN_NAME=? SET TARGET.DOMAIN_ALIAS=ALIAS.ID - WHERE TARGET.URL_PART=? + WHERE TARGET.DOMAIN_NAME=? """)) { stmt.setString(1, link.to().toString()); stmt.setString(2, link.from().toString()); @@ -81,7 +80,7 @@ public class SqlLoadProcessedDomain { } } catch (SQLException ex) { - ex.printStackTrace(); + logger.warn("SQL error inserting domain alias", ex); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java index 7d8851ca..ba9ae43a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java @@ -1,11 +1,13 @@ package nu.marginalia.wmsa.edge.converting.loader; +import com.google.common.hash.Hashing; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.nio.charset.StandardCharsets; import java.sql.SQLException; import java.sql.Types; @@ -25,12 +27,13 @@ public class SqlLoadUrls { stmt.execute(""" CREATE PROCEDURE INSERT_URL ( IN PROTO VARCHAR(255), - IN DOMAIN_NAME VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, + IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, IN PORT INT, - IN URL VARCHAR(255) + IN PATH VARCHAR(255), + IN PATH_HASH BIGINT ) BEGIN - INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,URL) SELECT PROTO,ID,PORT,URL FROM EC_DOMAIN WHERE URL_PART=DOMAIN_NAME; + INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN; END """); } @@ -42,8 +45,8 @@ public class SqlLoadUrls { public void load(LoaderData data, EdgeUrl[] urls) { try (var conn = dataSource.getConnection(); - var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?)"); - var queryCall = conn.prepareStatement("SELECT ID, PROTO, URL FROM EC_URL WHERE DOMAIN_ID=?") + var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?, ?)"); + var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH FROM EC_URL WHERE DOMAIN_ID=?") ) { conn.setAutoCommit(false); @@ -58,6 +61,7 @@ public class SqlLoadUrls { insertCall.setNull(3, Types.INTEGER); } insertCall.setString(4, url.path); + insertCall.setLong(5, hashPath(url.path)); insertCall.addBatch(); } var ret = insertCall.executeBatch(); @@ -86,7 +90,11 @@ public class SqlLoadUrls { } catch (SQLException ex) { - ex.printStackTrace(); + logger.warn("SQL error inserting URLs", ex); } } + + private long hashPath(String path) { + return Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8).asLong(); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java index d36cb830..b75de436 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java @@ -15,7 +15,7 @@ public class InstructionsCompiler { public List compile(ProcessedDomain domain) { List ret = new ArrayList<>(domain.size()*4); - ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.averageQuality().orElse(-5.))); + ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); if (domain.documents != null) { compileUrls(ret, domain.documents); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java index 2f25d6d7..52fe338a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java @@ -34,11 +34,10 @@ public class CrawlJobExtractorMain { private static final String domainsSql = """ - SELECT ID, LOWER(EC_DOMAIN.URL_PART) + SELECT ID, LOWER(EC_DOMAIN.DOMAIN_NAME) FROM EC_DOMAIN - WHERE QUALITY_RAW>-100 - AND INDEXED>0 - AND STATE<2 + WHERE INDEXED>0 + AND STATE='ACTIVE' OR STATE='EXHAUSTED' ORDER BY INDEX_DATE ASC, DISCOVER_DATE ASC, @@ -49,8 +48,8 @@ public class CrawlJobExtractorMain { private static final String urlsSql = """ - SELECT CONCAT(PROTO, "://", ?, URL) - FROM EC_URL + SELECT URL + FROM EC_URL_VIEW WHERE DOMAIN_ID=? ORDER BY VISITED DESC, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java index 21935fd0..ea1946fc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java @@ -6,6 +6,7 @@ import com.google.common.hash.Hashing; import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; @@ -30,19 +31,19 @@ public class CrawlJobExtractorPageRankMain { """ SELECT ID FROM EC_DOMAIN - WHERE URL_PART=? + WHERE DOMAIN_NAME=? """; private static final String specificDomainSqlFromId = """ - SELECT LOWER(URL_PART) + SELECT LOWER(DOMAIN_NAME) FROM EC_DOMAIN WHERE ID=? """; private static final String urlsSql = """ - SELECT CONCAT(PROTO, "://", ?, URL) - FROM EC_URL + SELECT URL + FROM EC_URL_VIEW WHERE DOMAIN_ID=? ORDER BY VISITED DESC, @@ -73,10 +74,12 @@ public class CrawlJobExtractorPageRankMain { Gson gson = new GsonBuilder().create(); - var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + var rpr = new BetterReversePageRank(domains, "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); rpr.setMaxKnownUrls(750); - var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size(), false); + var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size()); try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) { final var extractor = new CrawlJobExtractorPageRankMain(new DatabaseModule().provideConnection()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java index 81e8dd58..2f309b07 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java @@ -13,44 +13,14 @@ import java.util.Optional; @ImplementedBy(EdgeDataStoreDaoImpl.class) public interface EdgeDataStoreDao { - boolean isBlacklisted(EdgeDomain domain); - EdgeId getDomainId(EdgeDomain domain); - EdgeId getUrlId(EdgeUrl domain); - EdgeUrl getUrl(EdgeId id); - EdgeUrlDetails getUrlDetails(EdgeId id); - List getDomainNeighbors(EdgeId domainId, EdgeDomainBlacklist backlist, int count); List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist backlist, int count); + List getRandomDomains(int count, EdgeDomainBlacklist backlist); List getUrlDetailsMulti(List> ids); - List> getDomainIdsFromUrlIds(Collection> urlIds); - EdgeDomain getDomain(EdgeId id); - List> inboudUrls(EdgeId id, int limit); - List> outboundUrls(EdgeId id, int limit); - Optional> resolveAmbiguousDomain(String name); - - - int getPagesKnown(EdgeId domainId); - int getPagesVisited(EdgeId domainId); - int getPagesIndexed(EdgeId domainId); - - int getIncomingLinks(EdgeId domainId); - int getOutboundLinks(EdgeId domainId); - - double getDomainQuality(EdgeId domainId); - - EdgeDomainIndexingState getDomainState(EdgeId domainId); - - List getLinkingDomains(EdgeId domainId); - - List getNewUrls(EdgeId domainId, Collection links); - - double getRank(EdgeId domainId); - - void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java index a214bb15..30ea2256 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java @@ -33,7 +33,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { private final Cache> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build(); private final Cache> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build(); - private static final String DEFAULT_PROTOCOL = "http"; public static double QUALITY_LOWER_BOUND_CUTOFF = -15.; @Inject public EdgeDataStoreDaoImpl(HikariDataSource dataSource) @@ -48,30 +47,13 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { domainIdCache.invalidateAll(); } - @SneakyThrows - @Override - public boolean isBlacklisted(EdgeDomain domain) { - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) { - stmt.setString(1, domain.domain); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return true; - } else { - return false; - } - } - } - } - @SneakyThrows @Override public EdgeId getDomainId(EdgeDomain domain) { try (var connection = dataSource.getConnection()) { return domainIdCache.get(domain, () -> { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) { + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { stmt.setString(1, domain.toString()); var rsp = stmt.executeQuery(); if (rsp.next()) { @@ -86,104 +68,14 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { } } - @Override - @SneakyThrows - public EdgeId getUrlId(EdgeUrl url) { - try (var connection = dataSource.getConnection()) { - - return urlIdCache.get(url, () -> { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=? AND URL_PROTO=?")) { - stmt.setString(1, url.path); - stmt.setString(2, url.domain.toString()); - stmt.setString(3, url.proto); - - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return new EdgeId<>(rsp.getInt(1)); - } - } - // Lenient mode for http->https upgrades etc - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=?")) { - stmt.setString(1, url.path); - stmt.setString(2, url.domain.toString()); - - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return new EdgeId<>(rsp.getInt(1)); - } - } - throw new NoSuchElementException(url.toString()); - }); - } - catch (UncheckedExecutionException ex) { - throw ex.getCause(); + private String idList(List> ids) { + StringJoiner j = new StringJoiner(",", "(", ")"); + for (var id : ids) { + j.add(Integer.toString(id.getId())); } + return j.toString(); } - - @SneakyThrows - @Override - public List> getDomainIdsFromUrlIds(Collection> urlIds) { - List> results = new ArrayList<>(urlIds.size()); - - if (urlIds.isEmpty()) - return results; - - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT DOMAIN_ID FROM EC_URL WHERE ID IN " + urlIds - .stream() - .map(EdgeId::getId) - .map(Object::toString) - .collect(Collectors.joining(",", "(", ")")))) - { - var rsp = stmt.executeQuery(); - while (rsp.next()) { - results.add(new EdgeId<>(rsp.getInt(1))); - } - - } - } - - return results; - } - - static final Pattern badChars = Pattern.compile("[';\\\\]"); - private String saneString(String s) { - return "\'"+badChars.matcher(s).replaceAll("?")+"\'"; - } - @SneakyThrows - @Override - public EdgeUrl getUrl(EdgeId id) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.createStatement()) { - var rsp = stmt.executeQuery("SELECT URL_PROTO, URL_DOMAIN,URL_PORT,URL_PATH FROM EC_URL_VIEW WHERE ID=" + id.getId()); - if (rsp.next()) { - return new EdgeUrl(rsp.getString(1), new EdgeDomain(rsp.getString(2)), rsp.getInt(3), rsp.getString(4)); - } - throw new NoSuchElementException(); - } - } - } - - @SneakyThrows - @Override - public EdgeUrlDetails getUrlDetails(EdgeId id) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.createStatement()) { - var rsp = stmt.executeQuery("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID=" + id.getId()); - if (rsp.next()) { - EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5)); - return new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17)); - } - throw new NoSuchElementException(); - } - } - } - - @SneakyThrows @Override public List getUrlDetailsMulti(List> ids) { @@ -193,16 +85,39 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { List result = new ArrayList<>(ids.size()); try (var connection = dataSource.getConnection()) { - // This is SQL-injection safe, the IDs are of type int - String idString = ids.stream().map(EdgeId::getId).map(Objects::toString).collect(Collectors.joining(",", "(", ")")); - try (var stmt = connection.prepareStatement("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) { + String idString = idList(ids); + + try (var stmt = connection.prepareStatement( + """ + SELECT ID, URL, + TITLE, DESCRIPTION, + QUALITY, + WORDS_TOTAL, FORMAT, FEATURES, + IP, DOMAIN_STATE, + DATA_HASH + FROM EC_URL_VIEW WHERE ID IN + """ + idString)) { stmt.setFetchSize(ids.size()); var rsp = stmt.executeQuery(); while (rsp.next()) { - EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5)); - var val = new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17)); + EdgeUrl url = new EdgeUrl(rsp.getString(2)); + var val = new EdgeUrlDetails(rsp.getInt(1), url, + rsp.getString(3), // title + rsp.getString(4), // description + rsp.getDouble(5), // quality + rsp.getInt(6), // wordsTotal + rsp.getString(7), // format + rsp.getInt(8), // features + rsp.getString(9), // ip + EdgeDomainIndexingState.valueOf(rsp.getString(10)), // domainState + rsp.getInt(11), // dataHash + EdgePageScoreAdjustment.zero(), // urlQualityAdjustment + Integer.MAX_VALUE, // rankingId + Double.MAX_VALUE, // termScore + 0 // queryLength + ); if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) { result.add(val); } @@ -214,82 +129,13 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { return result; } - @Override - public List getDomainNeighbors(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) { - final Set domains = new HashSet<>(count*3); - - final String q = "SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART from EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL AND EC_DOMAIN_NEIGHBORS.DOMAIN_ID = ? ORDER BY ADJ_IDX LIMIT ?"; - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement(q)) { - stmt.setFetchSize(count); - stmt.setInt(1, domainId.getId()); - stmt.setInt(2, count); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int id = rsp.getInt(1); - String domain = rsp.getString(2); - - if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); - } - } - } - - final String q2 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE SOURCE_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?"; - try (var stmt = connection.prepareStatement(q2)) { - - stmt.setFetchSize(count); - stmt.setInt(1, domainId.getId()); - stmt.setInt(2, count); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int id = rsp.getInt(1); - String domain = rsp.getString(2); - - if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); - } - } - } - - final String q3 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE DEST_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?"; - try (var stmt = connection.prepareStatement(q3)) { - stmt.setFetchSize(count); - stmt.setInt(1, domainId.getId()); - stmt.setInt(2, count); - - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int id = rsp.getInt(1); - String domain = rsp.getString(2); - - if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); - } - } - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - - - return new ArrayList<>(domains); - } - @Override public List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) { final Set domains = new HashSet<>(count*3); final String q = """ - SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART, COUNT(*) AS CNT + SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT FROM EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID @@ -316,16 +162,14 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); } } } if (domains.size() < count/2) { final String q2 = """ - SELECT EC_DOMAIN.ID, URL_PART + SELECT EC_DOMAIN.ID, DOMAIN_NAME FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID @@ -347,9 +191,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); } } } @@ -357,11 +199,11 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { if (domains.size() < count/2) { final String q3 = """ - SELECT EC_DOMAIN.ID, URL_PART - FROM EC_DOMAIN - INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID + SELECT EC_DOMAIN.ID, DOMAIN_NAME + FROM EC_DOMAIN + INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID - INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID + INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE B.DEST_DOMAIN_ID=? AND STATE<2 AND KNOWN_URLS<1000 @@ -381,9 +223,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); } } } @@ -399,7 +239,15 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { @Override public List getRandomDomains(int count, EdgeDomainBlacklist blacklist) { - final String q = "SELECT DOMAIN_ID,URL_PART FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?"; + final String q = """ + SELECT DOMAIN_ID, DOMAIN_NAME + FROM EC_RANDOM_DOMAINS + INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID + WHERE STATE<2 + AND DOMAIN_ALIAS IS NULL + ORDER BY RAND() + LIMIT ? + """; List domains = new ArrayList<>(count); try (var conn = dataSource.getConnection()) { try (var stmt = conn.prepareStatement(q)) { @@ -410,9 +258,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); } } } @@ -428,7 +274,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { public EdgeDomain getDomain(EdgeId id) { try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) { + try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) { stmt.setInt(1, id.getId()); var rsp = stmt.executeQuery(); if (rsp.next()) { @@ -439,330 +285,4 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { } } - @Override @SneakyThrows - public List> inboudUrls(EdgeId id, int limit) { - - List> ret = new ArrayList<>(); - try (var connection = dataSource.getConnection()) { - - try (var stmt = - connection.prepareStatement("SELECT SRC_URL_ID FROM EC_RELATED_LINKS_IN WHERE DEST_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) { - stmt.setFetchSize(limit); - stmt.setInt(1, id.getId()); - stmt.setInt(2, limit); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - ret.add(new EdgeId<>(rsp.getInt(1))); - } - } - - } - - return ret; - } - - - @Override @SneakyThrows - public List> outboundUrls(EdgeId id, int limit) { - - List> ret = new ArrayList<>(); - try (var connection = dataSource.getConnection()) { - - try (var stmt = - connection.prepareStatement("SELECT DEST_URL_ID FROM EC_RELATED_LINKS_IN WHERE SRC_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) { - stmt.setFetchSize(limit); - stmt.setInt(1, id.getId()); - stmt.setInt(2, limit); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - ret.add(new EdgeId<>(rsp.getInt(1))); - } - } - - } - - return ret; - } - - @Override - public Optional> resolveAmbiguousDomain(String name) { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { - stmt.setString(1, name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { - stmt.setString(1, "https://"+name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { - stmt.setString(1, "http://"+name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { - stmt.setString(1, "https://www."+name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { - stmt.setString(1, "http://www."+name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - } catch (SQLException throwables) { - logger.info("Could not resolve domain id for {}", name); - } - - return Optional.empty(); - } - - @SneakyThrows - @Override - public int getPagesKnown(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - @Override - public int getPagesVisited(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - - @SneakyThrows - @Override - public int getPagesIndexed(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - @Override - public int getIncomingLinks(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - @SneakyThrows - @Override - public int getOutboundLinks(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - @Override - public double getDomainQuality(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getDouble(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return -5; - } - } - - @Override - public EdgeDomainIndexingState getDomainState(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return EdgeDomainIndexingState.fromCode(rsp.getInt(1)); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return EdgeDomainIndexingState.ERROR; - } - - @Override - public List getLinkingDomains(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - List results = new ArrayList<>(25); - try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - results.add(new EdgeDomain(rsp.getString(1))); - } - return results; - } catch (Exception ex) { - logger.error("DB error", ex); - } - - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return Collections.emptyList(); - } - - @Override - public List getNewUrls(EdgeId domainId, Collection links) { - Map edgeUrlByPath = links.stream().collect(Collectors.toMap(EdgeUrl::getPath, Function.identity(), (a,b)->a)); - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT URL FROM EC_URL WHERE DOMAIN_ID=?")) { - stmt.setFetchSize(500); - stmt.setInt(1, domainId.getId()); - var rs = stmt.executeQuery(); - while (rs.next()) { - edgeUrlByPath.remove(rs.getString(1)); - } - } - } - catch (Exception ex) { - return Collections.emptyList(); - } - return new ArrayList<>(edgeUrlByPath.values()); - - } - - @Override - public double getRank(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getDouble(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return 1; - } - - @Override - public void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed) { - try (var connection = dataSource.getConnection(); - var stmt = connection.prepareStatement("UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=?, DOMAIN_ALIAS=?, INDEXED=GREATEST(INDEXED,?) WHERE ID=?")) { - stmt.setInt(1, state.code); - if (null == alias) { - stmt.setNull(2, Types.INTEGER); - } - else { - stmt.setInt(2, getDomainId(alias).getId()); - } - - stmt.setInt(3, minIndexed); - stmt.setInt(4, getDomainId(domain).getId()); - stmt.executeUpdate(); - connection.commit(); - } - catch (SQLException throwables) { - logger.error("SQL error", throwables); - } - } - - @SneakyThrows - private double getDomainQuality(Connection connection, EdgeDomain src) { - try (var stmt = connection.prepareStatement("SELECT QUALITY_RAW FROM EC_DOMAIN WHERE URL_PART=?")) { - stmt.setString(1, src.toString()); - var res = stmt.executeQuery(); - - if (res.next()) { - var q = res.getDouble(1); - if (q > 0.5) { - logger.warn("gDQ({}) -> 1", src); - } - return 0; - } - } - catch (SQLException ex) { - logger.error("DB error", ex); - } - - return -5; - } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java index f4cbb8d0..334ec5a9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java @@ -50,7 +50,7 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist { final TIntHashSet result = new TIntHashSet(1_000_000); try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) { + try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP")) { stmt.setFetchSize(1000); var rsp = stmt.executeQuery(); while (rsp.next()) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/radix/EdgeIndexBucket.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/radix/EdgeIndexBucket.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java index 2e8fdcd2..05bcfe75 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/radix/EdgeIndexBucket.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.index.radix; +package nu.marginalia.wmsa.edge.index; import nu.marginalia.wmsa.edge.index.EdgeIndexControl; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriter; -import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.index.service.query.Query; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriter; +import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.reader.query.Query; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java index b590af55..ab7c73fe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java @@ -3,7 +3,9 @@ package nu.marginalia.wmsa.edge.index; import com.google.inject.Inject; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.index.ConversionUnnecessaryException; +import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException; + +import java.io.IOException; public class EdgeIndexControl { @@ -27,7 +29,10 @@ public class EdgeIndexControl { System.gc(); } catch (ConversionUnnecessaryException unnecessary) { - + // swallow quietly + } + catch (IOException e) { + e.printStackTrace(); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index a04a4c83..de6276a8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -15,9 +15,9 @@ import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.Service; import nu.marginalia.wmsa.edge.index.model.*; -import nu.marginalia.wmsa.edge.index.service.SearchIndexes; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; -import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; +import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.wmsa.edge.model.*; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java index fb58ac0e..61e64b41 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java @@ -5,12 +5,16 @@ import com.google.inject.Singleton; import com.google.inject.name.Named; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; +import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPreconverter; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.service.index.*; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; +import nu.marginalia.wmsa.edge.index.reader.SearchIndex; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -89,7 +93,7 @@ public class IndexServicesFactory { } - public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException { + public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException { return new SearchIndexConverter(block, id, tmpFileDir, preconverterOutputFile.get(id), indexWriteWordsFile.get(id, block.id), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java index fd7f529f..2242f476 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.conversion; public class ConversionUnnecessaryException extends Exception { public ConversionUnnecessaryException() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchEngineRanking.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchEngineRanking.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java index abaced82..220a9708 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchEngineRanking.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service; +package nu.marginalia.wmsa.edge.index.conversion; import gnu.trove.list.TIntList; import gnu.trove.map.hash.TIntIntHashMap; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java index c9b69386..0827b4e7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.conversion; import com.google.inject.Inject; import com.google.inject.name.Named; @@ -6,9 +6,10 @@ import gnu.trove.set.hash.TIntHashSet; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; +import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.index.wordstable.WordsTableWriter; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter; import nu.marginalia.util.btree.BTreeWriter; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.multimap.MultimapFileLong; @@ -32,18 +33,24 @@ public class SearchIndexConverter { private final long fileLength; private final long urlsFileSize; + private final Path tmpFileDir; + private final FileChannel urlsTmpFileChannel; private final int wordCount; private final MultimapFileLong urlsTmpFileMap; private final Logger logger = LoggerFactory.getLogger(getClass()); private final IndexBlock block; private final int bucketId; - @org.jetbrains.annotations.NotNull + + private final File urlsFile; private final SearchIndexPartitioner partitioner; private final TIntHashSet spamDomains; private final MultimapSorter urlTmpFileSorter; + private final static int internalSortLimit = + Boolean.getBoolean("small-ram") ? 1024*1024 : 1024*1024*256; + @SneakyThrows public static long wordCount(File inputFile) { try (RandomAccessFile raf = new RandomAccessFile(inputFile, "r")) { @@ -52,7 +59,6 @@ public class SearchIndexConverter { } } - @SneakyThrows @Inject public SearchIndexConverter(IndexBlock block, int bucketId, @Named("tmp-file-dir") Path tmpFileDir, @@ -61,13 +67,15 @@ public class SearchIndexConverter { @Named("edge-index-write-urls-file") File outputFileUrls, SearchIndexPartitioner partitioner, EdgeDomainBlacklist blacklist) - throws ConversionUnnecessaryException + throws ConversionUnnecessaryException, IOException { this.block = block; this.bucketId = bucketId; - urlsFile = outputFileUrls; + this.tmpFileDir = tmpFileDir; + this.urlsFile = outputFileUrls; this.partitioner = partitioner; this.spamDomains = blacklist.getSpamDomains(); + logger.info("Converting {} ({}) {}", block.id, block, inputFile); Files.deleteIfExists(outputFileWords.toPath()); @@ -89,18 +97,16 @@ public class SearchIndexConverter { urlsFileSize = getUrlsSize(buffer, inputChannel); var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); - - var urlsTmpFileRaf = new RandomAccessFile(tmpUrlsFile.toFile(), "rw"); urlsTmpFileChannel = urlsTmpFileRaf.getChannel(); urlsTmpFileMap = new MultimapFileLong(urlsTmpFileRaf, FileChannel.MapMode.READ_WRITE, urlsFileSize, 8*1024*1024, false); - urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, 1024*1024*256); + urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit); logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block); - long[] wordIndexTable = createWordIndexTable(outputFileWords, inputChannel); + WordIndexOffsetsTable wordIndexTable = createWordIndexTable(outputFileWords, inputChannel); logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block); - createUrlTable(tmpFileDir, buffer, raf, wordIndexTable); + createUrlTable(buffer, raf, wordIndexTable); Files.delete(tmpUrlsFile); raf.close(); @@ -140,99 +146,69 @@ public class SearchIndexConverter { return reader.size; } - private void createUrlTable(Path tmpFileDir, ByteBuffer buffer, RandomAccessFile raf, long[] wordIndexTable) throws IOException { - logger.debug("Table size = {}", wordIndexTable.length); - int[] wordIndex = new int[wordIndexTable.length]; + private void createUrlTable(ByteBuffer buffer, RandomAccessFile raf, WordIndexOffsetsTable wordOffsetsTable) throws IOException { + logger.info("Table size = {}", wordOffsetsTable.length()); + raf.seek(FILE_HEADER_SIZE); var channel = raf.getChannel(); try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) { - var reader = new IndexReader(buffer, channel) { + int[] wordWriteOffset = new int[wordOffsetsTable.length()]; + + new IndexReader(buffer, channel) { @Override public void eachWord(long urlId, int wordId) throws IOException { - if (wordId >= wordIndex.length) + if (wordId >= wordWriteOffset.length) return; - if (wordId != 0) { - if (!(wordIndexTable[wordId - 1] + wordIndex[wordId] <= wordIndexTable[wordId])) { - logger.error("Crazy state: wordId={}, index={}, lower={}, upper={}", - wordId, - wordIndex[wordId], - wordIndexTable[wordId - 1], - wordIndexTable[wordId]); - throw new IllegalStateException(); - } - } if (wordId > 0) { - rwf.put(wordIndexTable[wordId - 1] + wordIndex[wordId]++, translateUrl(urlId)); + rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, translateUrl(urlId)); } else { - rwf.put(wordIndex[wordId]++, translateUrl(urlId)); + rwf.put(wordWriteOffset[wordId]++, translateUrl(urlId)); } } - }; - - reader.read(); + }.read(); rwf.write(urlsTmpFileChannel); } urlsTmpFileChannel.force(false); + logger.info("URL TMP Table: {} Mb", channel.position()/(1024*1024)); - logger.debug("URL TMP Table: {} Mb", channel.position()/(1024*1024)); + if (wordOffsetsTable.length() > 0) { + logger.info("Sorting urls table"); + + wordOffsetsTable.forEach(urlTmpFileSorter::sort); - if (wordIndexTable.length > 0) { - logger.debug("Sorting urls table"); - sortUrls(wordIndexTable); urlsTmpFileMap.force(); } else { logger.warn("urls table empty -- nothing to sort"); } - - long idx = 0; - + logger.info("Writing BTree"); try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) { var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext); - if (wordIndexTable[0] != 0) { - int start = 0; - int end = (int) wordIndexTable[0]; + wordOffsetsTable.fold((accumulatorIdx, start, length) -> { + // Note: The return value is accumulated into accumulatorIdx! - idx += writer.write(idx, (int) wordIndexTable[0], - offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end)); - } + return writer.write(accumulatorIdx, length, + slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length)); + }); - for (int i = 1; i < wordIndexTable.length; i++) { - if (wordIndexTable[i] != wordIndexTable[i - 1]) { - long start = wordIndexTable[i-1]; - long end = wordIndexTable[i]; - - idx += writer.write(idx, (int) (end-start), - offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end)); - } - } } catch (Exception e) { - e.printStackTrace(); + logger.error("Error while writing BTree", e); } } - @SneakyThrows - private void sortUrls(long[] wordIndices) { - urlTmpFileSorter.sort( 0, (int) wordIndices[0]); - - for (int i = 1; i < wordIndices.length; i++) { - urlTmpFileSorter.sort(wordIndices[i-1], (int) (wordIndices[i] - wordIndices[i-1])); - } - } - - private long[] createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws Exception { + private WordIndexOffsetsTable createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws IOException { inputChannel.position(FILE_HEADER_SIZE); logger.debug("Table size = {}", wordCount); WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount); - ByteBuffer buffer = ByteBuffer.allocateDirect(8*SearchIndexWriterImpl.MAX_BLOCK_SIZE); + ByteBuffer buffer = ByteBuffer.allocateDirect(8* SearchIndexWriterImpl.MAX_BLOCK_SIZE); logger.debug("Reading words"); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java similarity index 64% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java index 615fbc34..fcf6d175 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service; +package nu.marginalia.wmsa.edge.index.conversion; import com.google.inject.Inject; import com.google.inject.Singleton; @@ -10,7 +10,7 @@ import lombok.SneakyThrows; import nu.marginalia.util.ranking.BetterReversePageRank; import nu.marginalia.util.ranking.BetterStandardPageRank; import nu.marginalia.util.ranking.BuggyStandardPageRank; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.edge.index.model.RankingSettings; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -18,41 +18,28 @@ import org.slf4j.LoggerFactory; @Singleton public class SearchIndexDao { private final HikariDataSource dataSource; + private RankingDomainFetcher rankingDomains; private final RankingSettings rankingSettings; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject public SearchIndexDao(HikariDataSource dataSource, + RankingDomainFetcher rankingDomains, RankingSettings rankingSettings) { this.dataSource = dataSource; + this.rankingDomains = rankingDomains; this.rankingSettings = rankingSettings; logger.info("SearchIndexDao ranking settings = {}", rankingSettings); } - @SneakyThrows - public TIntHashSet getSpamDomains() { - final TIntHashSet result = new TIntHashSet(1_000_000); - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) { - var rsp = stmt.executeQuery(); - while (rsp.next()) { - result.add(rsp.getInt(1)); - } - } - } - - return result; - } - @SneakyThrows public TIntHashSet goodUrls() { TIntHashSet domains = new TIntHashSet(10_000_000, 0.5f, -1); TIntHashSet urls = new TIntHashSet(100_000_000, 0.5f, -1); try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND STATE>=0")) { + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) { stmt.setFetchSize(10_000); var rsp = stmt.executeQuery(); while (rsp.next()) { @@ -79,36 +66,36 @@ public class SearchIndexDao { @SneakyThrows public TIntList getRetroDomains() { - var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2, false); + var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new)); + return spr.pageRankWithPeripheralNodes(spr.size()/2); } @SneakyThrows public TIntList getSmallWebDomains() { - var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new)); + var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new)); rpr.setMaxKnownUrls(750); - return rpr.pageRankWithPeripheralNodes(rpr.size(), false); + return rpr.pageRankWithPeripheralNodes(rpr.size()); } @SneakyThrows public TIntList getAcademiaDomains() { - var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), rankingSettings.academia.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2, false); + var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new)); + return spr.pageRankWithPeripheralNodes(spr.size()/2); } @SneakyThrows public TIntList getStandardDomains() { - var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2, false); + var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new)); + return spr.pageRankWithPeripheralNodes(spr.size()/2); } @SneakyThrows public TIntList getSpecialDomains() { TIntArrayList results = new TIntArrayList(); try (var connection = dataSource.getConnection(); - var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE=2") + var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'") ) { var rs = stmt.executeQuery(); while (rs.next()) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java index cf281116..bf5a1d74 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java @@ -1,11 +1,9 @@ -package nu.marginalia.wmsa.edge.index.service.query; +package nu.marginalia.wmsa.edge.index.conversion; import com.google.inject.Inject; import com.google.inject.Singleton; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.service.SearchEngineRanking; -import nu.marginalia.wmsa.edge.index.service.SearchIndexDao; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexPreconverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexPreconverter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java index 5149b546..9e851025 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexPreconverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java @@ -1,10 +1,9 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.conversion; import com.google.inject.Inject; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java new file mode 100644 index 00000000..464e9388 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java @@ -0,0 +1,10 @@ +package nu.marginalia.wmsa.edge.index.conversion.words; + +public class WordIndexLengthsTable { + final long[] table; + + public WordIndexLengthsTable(int size) { + this.table = new long[size]; + } + public void increment(int idx) { table[idx]++; } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java new file mode 100644 index 00000000..29b88509 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java @@ -0,0 +1,67 @@ +package nu.marginalia.wmsa.edge.index.conversion.words; + +import java.io.IOException; + +public class WordIndexOffsetsTable { + final long[] table; + public final int numberOfUsedWords; + + public WordIndexOffsetsTable(long[] table, int numberOfUsedWords) { + + this.table = table; + this.numberOfUsedWords = numberOfUsedWords; + } + + public int length() { + return table.length; + } + + public void forEach(OffsetTableEntryConsumer o) throws IOException { + if (table[0] > 0) { + o.accept(0, (int) table[0]); + } + + for (int i = 1; i < table.length; i++) { + long start = table[i-1]; + int length = (int) (table[i] - start); + + if (length != 0) { + o.accept(start, length); + } + } + } + + /** + * Fold over each span in the file, left to right + */ + public long fold(OffsetTableEntryFoldConsumer o) throws IOException { + long total = 0; + + if (table[0] > 0) { + total = o.accept(total,0, (int) table[0]); + } + + for (int i = 1; i < table.length; i++) { + long start = table[i-1]; + int length = (int) (table[i] - start); + + if (length != 0) { + total += o.accept(total, start, length); + } + } + + return total; + } + + public long get(int i) { + return table[i]; + } + + public interface OffsetTableEntryConsumer { + void accept(long start, int length) throws IOException; + } + + public interface OffsetTableEntryFoldConsumer { + long accept(long accumulator, long start, int length) throws IOException; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java new file mode 100644 index 00000000..2056948b --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java @@ -0,0 +1,56 @@ +package nu.marginalia.wmsa.edge.index.conversion.words; + +/** Contains a stateful table of word index offsets, initially in lengths mode + * where the table contains how many postings exist for each word; then in offsets + * mode, where the lengths are converted into the necessary offsets for each block + * of document data. + * + * Caveat! This uses the same underlying array to conserve space. + * + */ +public class WordIndexTables { + private WordIndexLengthsTable lengthsTable; + private WordIndexOffsetsTable offsetsTable; + + private boolean converted = false; + + public WordIndexTables(int size) { + lengthsTable = new WordIndexLengthsTable(size); + } + + public WordIndexLengthsTable lengths() { + if (converted) throw new IllegalStateException("Table has been converted"); + + return lengthsTable; + } + + public WordIndexOffsetsTable offsets() { + if (!converted) throw new IllegalStateException("Table has not been converted"); + + return offsetsTable; + } + + public void convert() { + if (converted) throw new IllegalStateException("Table has been converted"); + + // Go from lengths to offsets, i.e. + // BEFORE: 1, 2, 1, 3, 0, 2 + // AFTER: 1, 3, 4, 7, 7, 9 + + long[] table = lengthsTable.table; + int numberOfUsedWords = 0; + + if (table[0] != 0) numberOfUsedWords = 1; + + for (int i = 1; i < table.length; i++) { + if (table[i] != 0) { + numberOfUsedWords++; + } + table[i] += table[i-1]; + } + + lengthsTable = null; + offsetsTable = new WordIndexOffsetsTable(table, numberOfUsedWords); + converted = true; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java new file mode 100644 index 00000000..7f762ff3 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java @@ -0,0 +1,75 @@ +package nu.marginalia.wmsa.edge.index.conversion.words; + +import nu.marginalia.util.btree.BTreeWriter; +import nu.marginalia.util.btree.model.BTreeContext; +import nu.marginalia.util.multimap.MultimapFileLong; +import nu.marginalia.util.multimap.MultimapFileLongSlice; +import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; + +import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.urlsBTreeContext; + +public class WordsTableWriter { + private final WordIndexTables table; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8); + + public WordsTableWriter(int length) { + table = new WordIndexTables(length); + } + + public void acceptWord(int wordId) { + table.lengths().increment(wordId); + } + + public WordIndexOffsetsTable getTable() { + return table.offsets(); + } + + public void write(File file) throws IOException { + table.convert(); + + logger.info("Writing table - {} max", table.offsets().numberOfUsedWords); + + final int tableSize = table.offsets().numberOfUsedWords; + + try (var mmf = MultimapFileLong.forOutput(file.toPath(), tableSize/8L)) { + mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal()); + long offset = 1; + + var writer = new BTreeWriter(mmf, wordsBTreeContext); + + writer.write(offset, tableSize, this::writeBTreeBlock); + } + } + + private void writeBTreeBlock(MultimapFileLongSlice mapSlice) { + long urlFileOffset = 0; + int idx = 0; + + var offsetTable = table.offsets().table; + + if (offsetTable[0] != 0) { + int length = (int) offsetTable[0]; + mapSlice.put(idx++, (long)length<<32); + mapSlice.put(idx++, 0); + + urlFileOffset += (urlsBTreeContext.calculateSize(length)); + } + + for (int i = 1; i < offsetTable.length; i++) { + final int length = (int)(offsetTable[i] - offsetTable[i-1]); + + if (length > 0) { + mapSlice.put(idx++, (long)length << 32 | i); + mapSlice.put(idx++, urlFileOffset); + + urlFileOffset += (urlsBTreeContext.calculateSize(length)); + } + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryReader.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java index 90d270d2..ca10c000 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.dictionary; +package nu.marginalia.wmsa.edge.index.dictionary; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryWriter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java index 9ce1b149..906231be 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.dictionary; +package nu.marginalia.wmsa.edge.index.dictionary; import com.google.inject.Inject; import com.google.inject.Singleton; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/TokenCompressor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/TokenCompressor.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java index 9f26fffd..5a3d73ab 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/TokenCompressor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.dictionary; +package nu.marginalia.wmsa.edge.index.dictionary; import nu.marginalia.util.ByteFolder; import nu.marginalia.util.dict.DictionaryHashMap; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java index ca5d70b3..11fc186a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.journal; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.EdgeDomain; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java index 2f482815..cf76ada2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.journal; import io.reactivex.rxjava3.disposables.Disposable; import io.reactivex.rxjava3.schedulers.Schedulers; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.EdgeUrl; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/BtreeWordsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java similarity index 58% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/BtreeWordsTable.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java index 0a6a70c0..2bde1aa7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/BtreeWordsTable.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java @@ -1,36 +1,80 @@ -package nu.marginalia.wmsa.edge.index.service.index.wordstable; +package nu.marginalia.wmsa.edge.index.reader; import com.upserve.uppend.blobs.NativeIO; import nu.marginalia.util.btree.BTreeReader; import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.channels.FileChannel; import java.util.function.LongConsumer; -import static nu.marginalia.wmsa.edge.index.service.index.wordstable.WordsTableWriter.wordsBTreeContext; +import static nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter.wordsBTreeContext; -public class BtreeWordsTable extends IndexWordsTable{ - private final MultimapFileLong words; - private final BTreeReader reader; - private final BTreeHeader header; - private final int HEADER_OFFSET = 1; +public class IndexWordsTable implements AutoCloseable { + protected final MultimapFileLong words; + protected final BTreeReader reader; + protected final BTreeHeader header; + protected final int HEADER_OFFSET = 1; + final Logger logger = LoggerFactory.getLogger(getClass()); - public BtreeWordsTable(MultimapFileLong words) { + private static final int BUFFER_SIZE = 1024*1024*64; + + public IndexWordsTable(MultimapFileLong words) { this.words = words; - reader = new BTreeReader(words, wordsBTreeContext); header = reader.getHeader(HEADER_OFFSET); madvise(); } - private void madvise() { + public static IndexWordsTable ofFile(RandomAccessFile file) throws IOException { + var wordsFile = openWordsFile(file); + long signature = wordsFile.get(0); + + if (signature == Strategy.BTREE.ordinal()) { + return new IndexWordsTable(wordsFile); + } + + throw new IllegalArgumentException("Unknown signature " + signature); + } + + private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException { + return new MultimapFileLong(wordsFile, + FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false); + } + + public long positionForWord(int wordId) { + + long offset = reader.offsetForEntry(header, wordId); + if (offset < 0) { + return -1L; + } + + return words.get(offset+1); + } + + public int wordLength(int wordId) { + + long offset = reader.offsetForEntry(header, wordId); + if (offset < 0) { + return -1; + } + + return (int)(words.get(offset) >> 32); + } + + protected void madvise() { words.advice(NativeIO.Advice.Random); words.advice0(NativeIO.Advice.WillNeed); var h = reader.getHeader(HEADER_OFFSET); int length = (int)(h.dataOffsetLongs() - h.indexOffsetLongs()); + words.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length); words.pokeRange(h.indexOffsetLongs(), length); } @@ -58,31 +102,13 @@ public class BtreeWordsTable extends IndexWordsTable{ } } - @Override - public long positionForWord(int wordId) { - - long offset = reader.offsetForEntry(header, wordId); - if (offset < 0) { - return -1L; - } - - return words.get(offset+1); - } - - @Override - public int wordLength(int wordId) { - - long offset = reader.offsetForEntry(header, wordId); - if (offset < 0) { - return -1; - } - - return (int)(words.get(offset) >> 32); - } - @Override public void close() throws Exception { words.close(); } + public enum Strategy { + BTREE + } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java index 17e62437..042f8f54 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java @@ -1,20 +1,18 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.reader; import com.google.inject.Inject; import com.google.inject.name.Named; import com.upserve.uppend.blobs.NativeIO; import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; import nu.marginalia.util.btree.BTreeReader; import nu.marginalia.util.multimap.MultimapFileLong; -import org.eclipse.jetty.util.thread.ThreadPool; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; -import java.util.concurrent.ForkJoinPool; import java.util.stream.LongStream; public class SearchIndex implements AutoCloseable { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java index 7baeb8ae..8e7fea81 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.reader; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.query.IndexQueryBuilder; -import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.index.service.query.Query; +import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryBuilder; +import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.reader.query.Query; import org.apache.commons.lang3.tuple.Pair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -105,10 +105,8 @@ public class SearchIndexReader implements AutoCloseable { .mapToLong(idx -> idx.numUrls(word)) .sum() ); - } - public IndexBlock getBlockForResult(int searchTerm, long urlId) { for (var block : indicesBySearchOrder) { var index = indices.get(block); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java similarity index 91% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java index 91065101..863c0c65 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.index.service; +package nu.marginalia.wmsa.edge.index.reader; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; -import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.EdgeIndexBucket; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java index be217057..6f54dd2d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.index.service.query; +package nu.marginalia.wmsa.edge.index.reader.query; import com.google.common.collect.Streams; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndex; +import nu.marginalia.wmsa.edge.index.reader.SearchIndex; import java.util.Collection; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java index 2ec30e65..3608f70a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.query; +package nu.marginalia.wmsa.edge.index.reader.query; public class IndexSearchBudget { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/Query.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java similarity index 73% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/Query.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java index 09f7701b..5f343d54 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/Query.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.query; +package nu.marginalia.wmsa.edge.index.reader.query; import java.util.stream.LongStream; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java deleted file mode 100644 index d1c9f10a..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java +++ /dev/null @@ -1,6 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service; - -public enum SearchOrder { - ASCENDING, - REVERSED -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java deleted file mode 100644 index 5b557db1..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java +++ /dev/null @@ -1,48 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service.index.wordstable; - -import nu.marginalia.util.multimap.MultimapFileLong; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.channels.FileChannel; -import java.util.function.LongConsumer; - -public abstract class IndexWordsTable implements AutoCloseable { - final Logger logger = LoggerFactory.getLogger(getClass()); - - private static final int BUFFER_SIZE = 1024*1024*64; - - public static IndexWordsTable ofFile(RandomAccessFile file) throws IOException { - var wordsFile = openWordsFile(file); - long signature = wordsFile.get(0); - - if (signature == Strategy.BTREE.ordinal()) { - return new BtreeWordsTable(wordsFile); - } - throw new IllegalArgumentException("Unknown signature " + signature); - } - - private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException { - return new MultimapFileLong(wordsFile, - FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false); - } - - public abstract long positionForWord(int wordId); - - public abstract int wordLength(int wordId); - public abstract void forEachWordsOffset(LongConsumer offsetConsumer); - - @Override - public void close() throws Exception { - - } - - public record TableWordRange(long start, long end) {} - - public enum Strategy { - FLAT, HASH, BTREE_OLD, BTREE - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java deleted file mode 100644 index 3097dd47..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java +++ /dev/null @@ -1,85 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service.index.wordstable; - -import nu.marginalia.util.btree.BTreeWriter; -import nu.marginalia.util.btree.model.BTreeContext; -import nu.marginalia.util.multimap.MultimapFileLong; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.*; - -import static nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter.urlsBTreeContext; - -public class WordsTableWriter { - private final long[] table; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8); - - public WordsTableWriter(int length) { - table = new long[length]; - } - - public void acceptWord(int wordId) { - if (wordId >= table.length) { - logger.warn("Invalid word-id {}", wordId); - } - else { - table[wordId]++; - } - } - - public long[] getTable() { - return table; - } - public void write(File file) throws Exception { - - int tableSize = 0; - - if (table[0] != 0) tableSize = 1; - - for (int i = 1; i < table.length; i++) { - if (table[i] != 0) { - tableSize++; - } - table[i] += table[i-1]; - } - - logger.info("Writing table {} words {} max", tableSize, table.length); - - writeBtreeWordsFile(file, table, tableSize); - - } - - private void writeBtreeWordsFile(File outputFileWords, long[] table, int tableSize) throws Exception { - try (var mmf = MultimapFileLong.forOutput(outputFileWords.toPath(), tableSize/8L)) { - mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal()); - long offset = 1; - - var writer = new BTreeWriter(mmf, wordsBTreeContext); - - writer.write(offset, tableSize, (idx) -> { - long urlFileOffset = 0; - - if (table[0] != 0) { - int length = (int) table[0]; - mmf.put(idx++, (long)length<<32); - mmf.put(idx++, 0); - - urlFileOffset += (urlsBTreeContext.calculateSize(length)); - } - - for (int i = 1; i < table.length; i++) { - if (table[i] != table[i - 1]) { - int length = (int)(table[i] - table[i-1]); - mmf.put(idx++, (long)length << 32 | i); - mmf.put(idx++, urlFileOffset); - - urlFileOffset += (urlsBTreeContext.calculateSize(length)); - } - } - }); - } - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java index 53740c95..d1945c9e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java @@ -55,8 +55,11 @@ public class EdgeDomain implements WideHashable { } } } + } - + public EdgeUrl toRootUrl() { + // Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http + return new EdgeUrl("http", this, null, "/"); } public String toString() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java index 119da59d..b10d0e88 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java @@ -1,27 +1,12 @@ package nu.marginalia.wmsa.edge.model.crawl; public enum EdgeDomainIndexingState { - ACTIVE(0), - EXHAUSTED(1), - SPECIAL(2), - SOCIAL_MEDIA(3), - BLOCKED(-1), - REDIR(-2), - ERROR(-3), - UNKNOWN(-100); - - public final int code; - - EdgeDomainIndexingState(int code) { - this.code = code; - } - - public static EdgeDomainIndexingState fromCode(int code) { - for (var state : values()) { - if (state.code == code) { - return state; - } - } - return UNKNOWN; - } + ACTIVE, + EXHAUSTED, + SPECIAL, + SOCIAL_MEDIA, + BLOCKED, + REDIR, + ERROR, + UNKNOWN } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java index 0063efd9..02c7197a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java @@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.model.search; import lombok.*; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.SearchOrder; import java.util.Arrays; import java.util.Collections; @@ -21,14 +20,13 @@ public class EdgeSearchSpecification { public final int limitTotal; public final String humanQuery; - public final SearchOrder searchOrder; public boolean stagger; public boolean experimental; public static EdgeSearchSpecification justIncludes(String... words) { return new EdgeSearchSpecification( IntStream.range(0, DYNAMIC_BUCKET_LENGTH+1).boxed().toList(), - Collections.singletonList(new EdgeSearchSubquery(Arrays.asList(words), Collections.emptyList(), IndexBlock.Title)), 10, 10, 10, "", SearchOrder.ASCENDING, false, false); + Collections.singletonList(new EdgeSearchSubquery(Arrays.asList(words), Collections.emptyList(), IndexBlock.Title)), 10, 10, 10, "", false, false); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java index ed5fd013..d46aa79e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java @@ -16,25 +16,24 @@ public class EdgeUrlDetails { public String description; public double urlQuality; - public double urlQualityRaw; - public double domainQuality; - public int links; // DEAD public int words; public String format; public int features; - public EdgePageScoreAdjustment urlQualityAdjustment; - public long rankingId; - public double termScore; public String ip; // BROKEN - public int domainState; - public int queryLength; + public EdgeDomainIndexingState domainState; + public int dataHash; + public EdgePageScoreAdjustment urlQualityAdjustment; + public long rankingId; + public double termScore; + public int queryLength; + public long rankingIdAdjustment() { int penalty = 0; @@ -136,7 +135,7 @@ public class EdgeUrlDetails { return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES); } public boolean isSpecialDomain() { - return domainState == EdgeDomainIndexingState.SPECIAL.code; + return domainState == EdgeDomainIndexingState.SPECIAL; } public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java index 10675cc5..66004dee 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java @@ -11,7 +11,6 @@ import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.SearchOrder; import nu.marginalia.wmsa.edge.model.*; import nu.marginalia.wmsa.edge.model.search.*; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet; @@ -136,7 +135,7 @@ public class EdgeSearchOperator { sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block)); - EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, 100, limitPerDomain, limitTotal, "", SearchOrder.ASCENDING, EdgeSearchProfile.YOLO.equals(profile), false); + EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, 100, limitPerDomain, limitTotal, "", EdgeSearchProfile.YOLO.equals(profile), false); return performQuery(ctx, new EdgeSearchQuery(specs), true); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java index 05fcaa04..212d09ab 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java @@ -1,7 +1,6 @@ package nu.marginalia.wmsa.edge.search; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.SearchOrder; import java.util.Arrays; import java.util.Collections; @@ -9,27 +8,27 @@ import java.util.List; import java.util.stream.Collectors; public enum EdgeSearchProfile { - DEFAULT("default", SearchOrder.ASCENDING, + DEFAULT("default", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 0, 1), - MODERN("modern", SearchOrder.ASCENDING, + MODERN("modern", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 2), - CORPO("corpo", SearchOrder.ASCENDING, + CORPO("corpo", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 4, 5, 6, 7), - YOLO("yolo", SearchOrder.ASCENDING, + YOLO("yolo", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 0, 2, 1, 3, 4, 6), - CORPO_CLEAN("corpo-clean", SearchOrder.ASCENDING, + CORPO_CLEAN("corpo-clean", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 4, 5), - ACADEMIA("academia", SearchOrder.ASCENDING, + ACADEMIA("academia", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 3), @@ -37,17 +36,15 @@ public enum EdgeSearchProfile { public final String name; - public final SearchOrder order; public final List additionalSearchTerm; public final List buckets; public final List indexBlocks; - EdgeSearchProfile(String name, SearchOrder order, + EdgeSearchProfile(String name, List additionalSearchTerm, List indexBlocks, int... buckets) { this.name = name; - this.order = order; this.additionalSearchTerm = additionalSearchTerm; this.indexBlocks = indexBlocks; this.buckets = Arrays.stream(buckets).boxed().collect(Collectors.toList()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java index 60520aa9..6e341721 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java @@ -32,7 +32,7 @@ import java.util.regex.Pattern; public class SiteSearchCommand implements SearchCommandInterface { private final EdgeDataStoreDao dataStoreDao; private final EdgeSearchOperator searchOperator; - private DomainInformationService domainInformationService; + private final DomainInformationService domainInformationService; private final Logger logger = LoggerFactory.getLogger(getClass()); private final MustacheRenderer siteInfoRenderer; @@ -91,7 +91,7 @@ public class SiteSearchCommand implements SearchCommandInterface { logger.info("Fetching Site Info: {}", word); var results = domainInformationService.domainInfo(word) - .orElseGet(() -> new DomainInformation(null, false, 0, 0, 0, 0, 0, 0, 0, EdgeDomainIndexingState.UNKNOWN, Collections.emptyList())); + .orElseGet(() -> new DomainInformation(null, false, 0, 0, 0, 0, 0, 0, EdgeDomainIndexingState.UNKNOWN, Collections.emptyList())); logger.debug("Results = {}", results); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java index c5c19187..d94ae487 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java @@ -18,7 +18,6 @@ public class DomainInformation { int pagesIndexed; int incomingLinks; int outboundLinks; - double nominalQuality; double ranking; EdgeDomainIndexingState state; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java index d3775dd9..1d77a9d0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java @@ -138,7 +138,6 @@ public class QueryFactory { .subqueries(subqueries) .limitByBucket(50) .limitTotal(100) - .searchOrder(profile.order) .humanQuery(query) .buckets(profile.buckets); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java index 487e1556..22b24aca 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java @@ -107,7 +107,7 @@ public class SearchResultDecorator { private double calculateTermScore(IndexBlock block, EdgeSearchResultItem resultItem, EdgeUrlDetails details) { return valuator.evaluateTerms(resultItem.scores, block, details.words) / Math.sqrt(1 + resultItem.queryLength) - + ((details.domainState == EdgeDomainIndexingState.SPECIAL.code) ? 1.25 : 0); + + ((details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java index 54179d64..2f79a9ea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java @@ -1,24 +1,43 @@ package nu.marginalia.wmsa.edge.search.siteinfo; +import com.zaxxer.hikari.HikariDataSource; +import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; +import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import nu.marginalia.wmsa.edge.search.model.DomainInformation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.inject.Inject; import javax.inject.Singleton; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Optional; +/* + TODO: This class needs to be refactored, a lot of + these SQL queries are redundant and can be + collapsed into one single query that fetches + all the information + */ @Singleton public class DomainInformationService { - private EdgeDataStoreDao dataStore; + private EdgeDataStoreDaoImpl dataStoreDao; + private HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public DomainInformationService(EdgeDataStoreDao dataStore) { - this.dataStore = dataStore; + public DomainInformationService( + EdgeDataStoreDaoImpl dataStoreDao, + HikariDataSource dataSource) { + this.dataStoreDao = dataStoreDao; + this.dataSource = dataSource; } @@ -28,29 +47,28 @@ public class DomainInformationService { if (domainId == null) { return Optional.empty(); } - EdgeDomain domain = dataStore.getDomain(domainId); + EdgeDomain domain = dataStoreDao.getDomain(domainId); - boolean blacklisted = dataStore.isBlacklisted(domain); - int pagesKnown = dataStore.getPagesKnown(domainId); - int pagesVisited = dataStore.getPagesVisited(domainId); - int pagesIndexed = dataStore.getPagesIndexed(domainId); - int incomingLinks = dataStore.getIncomingLinks(domainId); - int outboundLinks = dataStore.getOutboundLinks(domainId); - double rank = Math.round(10000.0*(1.0-dataStore.getRank(domainId)))/100; - EdgeDomainIndexingState state = dataStore.getDomainState(domainId); - double nominalQuality = Math.round(100*100*Math.exp(dataStore.getDomainQuality(domainId)))/100.; - List linkingDomains = dataStore.getLinkingDomains(domainId); + boolean blacklisted = isBlacklisted(domain); + int pagesKnown = getPagesKnown(domainId); + int pagesVisited = getPagesVisited(domainId); + int pagesIndexed = getPagesIndexed(domainId); + int incomingLinks = getIncomingLinks(domainId); + int outboundLinks = getOutboundLinks(domainId); + double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100; + EdgeDomainIndexingState state = getDomainState(domainId); + List linkingDomains = getLinkingDomains(domainId); - return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, nominalQuality, rank, state, linkingDomains)); + return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, rank, state, linkingDomains)); } private EdgeId getDomainFromPartial(String site) { try { - return dataStore.getDomainId(new EdgeDomain(site)); + return dataStoreDao.getDomainId(new EdgeDomain(site)); } catch (Exception ex) { try { - return dataStore.getDomainId(new EdgeDomain(site)); + return dataStoreDao.getDomainId(new EdgeDomain(site)); } catch (Exception ex2) { return null; @@ -58,4 +76,178 @@ public class DomainInformationService { } } + + @SneakyThrows + public boolean isBlacklisted(EdgeDomain domain) { + + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) { + stmt.setString(1, domain.domain); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return true; + } else { + return false; + } + } + } + } + + @SneakyThrows + public int getPagesKnown(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + @SneakyThrows + public int getPagesVisited(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + + @SneakyThrows + public int getPagesIndexed(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + @SneakyThrows + public int getIncomingLinks(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + @SneakyThrows + public int getOutboundLinks(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + @SneakyThrows + public double getDomainQuality(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getDouble(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return -5; + } + } + + public EdgeDomainIndexingState getDomainState(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return EdgeDomainIndexingState.valueOf(rsp.getString(1)); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + return EdgeDomainIndexingState.ERROR; + } + + public List getLinkingDomains(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + List results = new ArrayList<>(25); + try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + results.add(new EdgeDomain(rsp.getString(1))); + } + return results; + } catch (Exception ex) { + logger.error("DB error", ex); + } + + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + return Collections.emptyList(); + } + + public double getRank(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getDouble(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + return 1; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java index bb946238..05c67481 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java @@ -3,12 +3,13 @@ package nu.marginalia.wmsa.edge.tools; import com.google.inject.Inject; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import nu.marginalia.wmsa.edge.index.model.RankingSettings; -import nu.marginalia.wmsa.edge.index.service.SearchIndexDao; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexDao; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import org.mariadb.jdbc.Driver; import org.roaringbitmap.longlong.Roaring64Bitmap; import org.slf4j.Logger; @@ -59,7 +60,9 @@ public class IndexMergerMain { } var hikari = new DatabaseModule().provideConnection(); - var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, new RankingSettings())); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, domains, new RankingSettings())); var blacklist = new EdgeDomainBlacklistImpl(hikari); new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist); diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index fc9e515d..36ab040a 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -1,24 +1,11 @@ -DROP TABLE IF EXISTS EC_URL_LINK; -DROP VIEW IF EXISTS EC_PAGE_VIEW; - -DROP TABLE IF EXISTS DISC_DOMAIN_TAG; -DROP TABLE IF EXISTS DISC_TAG; -DROP TABLE IF EXISTS DISC_USER; - -DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS; +DROP TABLE IF EXISTS DOMAIN_METADATA; DROP TABLE IF EXISTS EC_FEED_URL; DROP TABLE IF EXISTS EC_DOMAIN_LINK; DROP TABLE IF EXISTS EC_PAGE_DATA; DROP TABLE IF EXISTS EC_URL; +DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS; DROP TABLE IF EXISTS EC_DOMAIN; -DROP TABLE IF EXISTS EC_TOP_DOMAIN; -DROP TABLE IF EXISTS EC_URL_DETAILS; -DROP VIEW IF EXISTS EC_URL_VIEW; -DROP VIEW IF EXISTS EC_URL_PART_HASH; -DROP TABLE IF EXISTS EC_URL_WORD; -DROP TABLE IF EXISTS EC_DICTIONARY; -DROP TABLE IF EXISTS DOMAIN_METADATA; CREATE TABLE IF NOT EXISTS DOMAIN_METADATA ( ID INT PRIMARY KEY, @@ -27,52 +14,31 @@ CREATE TABLE IF NOT EXISTS DOMAIN_METADATA ( GOOD_URLS INT DEFAULT 0 ); -CREATE TABLE IF NOT EXISTS EC_TOP_DOMAIN ( - ID INT PRIMARY KEY AUTO_INCREMENT, - URL_PART VARCHAR(255) UNIQUE NOT NULL, - ALIVE BOOLEAN DEFAULT TRUE NOT NULL -) -CHARACTER SET utf8mb4 -COLLATE utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS EC_DOMAIN ( ID INT PRIMARY KEY AUTO_INCREMENT, - URL_PART VARCHAR(255) UNIQUE NOT NULL, - INDEXED INT DEFAULT 0 NOT NULL, - QUALITY DOUBLE DEFAULT -5 NOT NULL, - QUALITY_RAW DOUBLE DEFAULT -5 NOT NULL, - QUALITY_ORIGINAL DOUBLE DEFAULT -5 NOT NULL, - URL_TOP_DOMAIN_ID INT NOT NULL, - URL_SUBDOMAIN VARCHAR(255) NOT NULL, - STATE INT DEFAULT 0 NOT NULL, + DOMAIN_NAME VARCHAR(255) UNIQUE NOT NULL, + DOMAIN_TOP VARCHAR(255) NOT NULL, + + INDEXED INT DEFAULT 0 NOT NULL COMMENT "~number of documents visited / 100", + STATE ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN') NOT NULL DEFAULT 'active' COMMENT "@see EdgeDomainIndexingState", RANK DOUBLE, - DOMAIN_ALIAS INTEGER, + IP VARCHAR(32), INDEX_DATE TIMESTAMP DEFAULT NOW(), DISCOVER_DATE TIMESTAMP DEFAULT NOW(), - FOREIGN KEY (URL_TOP_DOMAIN_ID) REFERENCES EC_TOP_DOMAIN(ID) ON DELETE CASCADE -) -CHARACTER SET utf8mb4 -COLLATE utf8mb4_unicode_ci; - -CREATE TABLE IF NOT EXISTS EC_DOMAIN_HISTORY ( - ID INT PRIMARY KEY AUTO_INCREMENT, - URL_PART VARCHAR(255) UNIQUE NOT NULL, - QUALITY_MEASURE DOUBLE DEFAULT -5 NOT NULL, - INBOUND_LINKS INT DEFAULT 1, - LINK_ADJUSTED_QUALITY DOUBLE GENERATED ALWAYS AS (0.3*QUALITY_MEASURE + 0.7*QUALITY_MEASURE / GREATEST(1, INBOUND_LINKS)), - RANK DOUBLE + IS_ALIVE BOOLEAN AS (STATE='ACTIVE' OR STATE='EXHAUSTED' OR STATE='SPECIAL' OR STATE='SOCIAL_MEDIA') VIRTUAL ) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS EC_DOMAIN_BLACKLIST ( ID INT PRIMARY KEY AUTO_INCREMENT, - URL_DOMAIN VARCHAR(255) UNIQUE NOT NULL + URL_DOMAIN VARCHAR(255) UNIQUE NOT NULL ) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; @@ -81,18 +47,15 @@ CREATE TABLE IF NOT EXISTS EC_URL ( ID INT PRIMARY KEY AUTO_INCREMENT, DOMAIN_ID INT NOT NULL, PROTO ENUM('http','https','gemini') NOT NULL, - URL VARCHAR(255) NOT NULL, + PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin, PORT INT, + PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain", VISITED BOOLEAN NOT NULL DEFAULT FALSE, - DATA_HASH INTEGER, - QUALITY_MEASURE DOUBLE, STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok', - IP VARCHAR(32), - - CONSTRAINT CONS UNIQUE (DOMAIN_ID, URL), + CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH), FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE ) CHARACTER SET utf8mb4 @@ -101,13 +64,15 @@ COLLATE utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS EC_PAGE_DATA ( ID INT PRIMARY KEY AUTO_INCREMENT, - TITLE VARCHAR(255), - DESCRIPTION VARCHAR(255), + TITLE VARCHAR(255) NOT NULL, + DESCRIPTION VARCHAR(255) NOT NULL, - WORDS_DISTINCT INTEGER, - WORDS_TOTAL INTEGER, - FORMAT VARCHAR(8), - FEATURES INT, + WORDS_TOTAL INTEGER NOT NULL, + FORMAT ENUM('PLAIN', 'UNKNOWN', 'HTML123', 'HTML4', 'XHTML', 'HTML5', 'MARKDOWN') NOT NULL, + FEATURES INT COMMENT "Bit-encoded feature set of document, @see HtmlFeature" NOT NULL, + + DATA_HASH INTEGER NOT NULL, + QUALITY DOUBLE NOT NULL, FOREIGN KEY (ID) REFERENCES EC_URL(ID) ON DELETE CASCADE ) @@ -115,13 +80,9 @@ CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; CREATE TABLE EC_FEED_URL ( - ID INT PRIMARY KEY AUTO_INCREMENT, - DOMAIN_ID INT NOT NULL, - PROTO VARCHAR(8) NOT NULL, - URL VARCHAR(255) NOT NULL, - PORT INT, + URL VARCHAR(255) PRIMARY KEY, + DOMAIN_ID INT, - CONSTRAINT CONS UNIQUE (DOMAIN_ID, URL), FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE ) CHARACTER SET utf8mb4 @@ -150,92 +111,63 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK ( FOREIGN KEY (DEST_DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE ); -CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK_AGGREGATE ( - DOMAIN_ID INT PRIMARY KEY NOT NULL, - LINKS INT -); - CREATE OR REPLACE VIEW EC_URL_VIEW AS SELECT - EC_DOMAIN.URL_PART AS URL_DOMAIN, - EC_URL.URL AS URL_PATH, - EC_TOP_DOMAIN.URL_PART AS URL_TOP, + IF(PORT IS NULL, + CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, EC_URL.PATH), + CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, ":", EC_URL.PORT, EC_URL.PATH)) + AS URL, + EC_URL.PATH_HASH AS PATH_HASH, + EC_URL.PATH AS PATH, + EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME, + EC_DOMAIN.DOMAIN_TOP AS DOMAIN_TOP, EC_URL.ID AS ID, EC_DOMAIN.ID AS DOMAIN_ID, - EC_TOP_DOMAIN.ID AS TOP_DOMAIN_ID, - EC_URL.PROTO AS URL_PROTO, - EC_URL.PORT AS URL_PORT, + EC_URL.VISITED AS VISITED, - EC_URL.DATA_HASH AS DATA_HASH, - EC_URL.QUALITY_MEASURE AS URL_QUALITY_MEASURE, - EC_DOMAIN.QUALITY AS DOMAIN_QUALITY_MEASURE, - EC_DOMAIN.QUALITY_RAW AS QUALITY_RAW, + + EC_PAGE_DATA.QUALITY AS QUALITY, + EC_PAGE_DATA.DATA_HASH AS DATA_HASH, EC_PAGE_DATA.TITLE AS TITLE, EC_PAGE_DATA.DESCRIPTION AS DESCRIPTION, - EC_URL.IP AS IP, - EC_DOMAIN.STATE AS STATE, EC_PAGE_DATA.WORDS_TOTAL AS WORDS_TOTAL, EC_PAGE_DATA.FORMAT AS FORMAT, EC_PAGE_DATA.FEATURES AS FEATURES, + + EC_DOMAIN.IP AS IP, + EC_DOMAIN.STATE AS STATE, EC_DOMAIN.RANK AS RANK, EC_DOMAIN.STATE AS DOMAIN_STATE FROM EC_URL LEFT JOIN EC_PAGE_DATA ON EC_PAGE_DATA.ID = EC_URL.ID INNER JOIN EC_DOMAIN - ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID - INNER JOIN EC_TOP_DOMAIN - ON EC_DOMAIN.URL_TOP_DOMAIN_ID=EC_TOP_DOMAIN.ID; - -CREATE OR REPLACE VIEW EC_DISCOVER_TASKS_VIEW AS - SELECT - ID, - URL_PART - FROM EC_DOMAIN - WHERE - DOMAIN_ALIAS IS NULL - AND INDEXED = 0 - ORDER BY QUALITY DESC, ID ASC; + ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID; CREATE OR REPLACE VIEW EC_RELATED_LINKS_VIEW AS SELECT SOURCE_DOMAIN_ID, - SOURCE_DOMAIN.URL_PART AS SOURCE_URL, - SOURCE_TOP_DOMAIN.URL_PART AS SOURCE_TOP_URL, + SOURCE_DOMAIN.DOMAIN_NAME AS SOURCE_DOMAIN, + SOURCE_DOMAIN.DOMAIN_TOP AS SOURCE_TOP_DOMAIN, DEST_DOMAIN_ID, - DEST_DOMAIN.URL_PART AS DEST_URL, - DEST_TOP_DOMAIN.URL_PART AS DEST_TOP_URL + DEST_DOMAIN.DOMAIN_NAME AS DEST_DOMAIN, + DEST_DOMAIN.DOMAIN_TOP AS DEST_TOP_DOMAIN FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN AS SOURCE_DOMAIN ON SOURCE_DOMAIN.ID=SOURCE_DOMAIN_ID - INNER JOIN EC_TOP_DOMAIN AS SOURCE_TOP_DOMAIN - ON SOURCE_TOP_DOMAIN.ID=SOURCE_DOMAIN.URL_TOP_DOMAIN_ID INNER JOIN EC_DOMAIN AS DEST_DOMAIN ON DEST_DOMAIN.ID=DEST_DOMAIN_ID - INNER JOIN EC_TOP_DOMAIN AS DEST_TOP_DOMAIN - ON DEST_TOP_DOMAIN.ID=DEST_DOMAIN.URL_TOP_DOMAIN_ID ; CREATE OR REPLACE VIEW EC_RELATED_LINKS_IN AS SELECT IN_URL.ID AS SRC_URL_ID, - IN_URL.QUALITY_MEASURE AS SRC_URL_QUALITY, - OUT_URL.ID AS DEST_URL_ID, - OUT_URL.QUALITY_MEASURE AS DEST_URL_QUALITY - FROM EC_URL AS IN_URL - INNER JOIN EC_DOMAIN_LINK - ON IN_URL.DOMAIN_ID=EC_DOMAIN_LINK.SOURCE_DOMAIN_ID - INNER JOIN EC_URL AS OUT_URL - ON OUT_URL.DOMAIN_ID=EC_DOMAIN_LINK.DEST_DOMAIN_ID - WHERE IN_URL.VISITED=TRUE - AND IN_URL.DATA_HASH IS NOT NULL - AND OUT_URL.VISITED=TRUE - AND OUT_URL.DATA_HASH IS NOT NULL; - -CREATE TABLE IF NOT EXISTS EC_DOMAIN_BACKLINKS ( - ID INT PRIMARY KEY, - LINKEDNESS INT -); + OUT_URL.ID AS DEST_URL_ID + FROM EC_DOMAIN_LINK + INNER JOIN EC_URL AS IN_URL ON IN_URL.DOMAIN_ID=EC_DOMAIN_LINK.SOURCE_DOMAIN_ID + INNER JOIN EC_URL AS OUT_URL ON OUT_URL.DOMAIN_ID=EC_DOMAIN_LINK.DEST_DOMAIN_ID + WHERE IN_URL.VISITED AND IN_URL.STATE = 'ok' + AND OUT_URL.VISITED AND OUT_URL.STATE = 'ok'; CREATE TABLE IF NOT EXISTS EC_API_KEY ( LICENSE_KEY VARCHAR(255) UNIQUE, @@ -245,16 +177,8 @@ CREATE TABLE IF NOT EXISTS EC_API_KEY ( RATE INT DEFAULT 10 ); -CREATE INDEX IF NOT EXISTS EC_DOMAIN_RANK_INDEX ON EC_DOMAIN (RANK); -CREATE INDEX IF NOT EXISTS EC_DOMAIN_QUALITY_INDEX ON EC_DOMAIN (QUALITY,STATE); - CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED); -CREATE INDEX IF NOT EXISTS EC_DOMAIN_ID_INDEXED_INDEX ON EC_DOMAIN (ID, INDEXED); -CREATE INDEX IF NOT EXISTS EC_DOMAIN_TRIO ON EC_DOMAIN (STATE, DOMAIN_ALIAS, INDEXED, QUALITY); - -CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED); -CREATE INDEX IF NOT EXISTS EC_URL_VISITED_STATE ON EC_URL (VISITED, STATE); -CREATE INDEX IF NOT EXISTS EC_URL_IP ON EC_URL (IP); +CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP); ---; diff --git a/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb b/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb index 5696b251..cd8abf67 100644 --- a/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb @@ -10,5 +10,4 @@ Pages Known: {{pagesKnown}} Pages Indexed: {{pagesKnown}} Inbound Links: {{inboundLinks}} Outbound Links: {{outboundLinks}} -Nominal Quality: {{nominalQuality}}% Crawl Ranking: {{ranking}}% \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/site-info.hdb b/marginalia_nu/src/main/resources/templates/edge/site-info.hdb index 19b585b8..837f320d 100644 --- a/marginalia_nu/src/main/resources/templates/edge/site-info.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/site-info.hdb @@ -37,7 +37,6 @@

Links

- Nominal Quality: {{nominalQuality}}%
Crawl Ranking: {{ranking}}%
Incoming Links: {{incomingLinks}}
Outbound Links: {{outboundLinks}}
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java b/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java index 84b9f165..26d397a8 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java @@ -43,7 +43,7 @@ public class TestUtil { logger.info("Running script {}", scriptFile); try (var scriptStream = ClassLoader.getSystemResourceAsStream(scriptFile); var stmt = conn.createStatement()) { - for (String s : new String(scriptStream.readAllBytes()).split(";")) { + for (String s : new String(scriptStream.readAllBytes()).split("(;|---)")) { if (!s.isBlank()) { try { Assertions.assertTrue(stmt.executeUpdate(s) >= 0); diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java index 1915d989..875cda37 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java @@ -90,10 +90,10 @@ class BTreeWriterTest { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (offset) -> { + writer.write(0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + 2L*i, data[i]); - mmf.put(offset + 2L*i + 1, i); + slice.put(2L*i, data[i]); + slice.put( 2L*i + 1, i); } }); mmf.force(); @@ -133,10 +133,10 @@ class BTreeWriterTest { { var writer = new BTreeWriter(mmf, ctx); - writer.write( 0, toPut.size(), (offset) -> { + writer.write( 0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + 2L*i, data[i]); - mmf.put(offset + 2L*i + 1, i); + slice.put(2L*i, data[i]); + slice.put(2L*i + 1, i); } }); mmf.force(); @@ -182,9 +182,9 @@ class BTreeWriterTest { try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (offset) -> { + writer.write(0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + i, data[i]); + slice.put(i, data[i]); } }); mmf.force(); @@ -235,9 +235,9 @@ class BTreeWriterTest { try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (offset) -> { + writer.write(0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + i, data[i]); + slice.put(i, data[i]); } }); mmf.force(); @@ -288,10 +288,10 @@ class BTreeWriterTest { try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (offset) -> { + writer.write(0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + i*2L, data[i]); - mmf.put(offset + i*2L+1, i); + slice.put(i*2L, data[i]); + slice.put(i*2L+1, i); } }); mmf.force(); diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java index 326c9b15..9331a998 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java @@ -27,7 +27,7 @@ class LongPairHashMapTest { try { RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true); - var lphm = new LongPairHashMap(mmf, 1024); + var lphm = LongPairHashMap.createNew(mmf, 1024); toPut.forEach(i -> { lphm.put(new LongPairHashMap.CellData(i, i)); }); @@ -36,7 +36,7 @@ class LongPairHashMapTest { RandomAccessFile raf2 = new RandomAccessFile(tempFile.toFile(), "rw"); MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000, true); - var lphm2 = new LongPairHashMap(mmf2); + var lphm2 = LongPairHashMap.loadExisting(mmf2); toPut.forEach(i -> { Assertions.assertTrue(lphm2.get(i).isSet()); Assertions.assertEquals(i, (int) lphm2.get(i).getKey()); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java new file mode 100644 index 00000000..d839bbb2 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java @@ -0,0 +1,48 @@ +package nu.marginalia.wmsa.edge.converting.loader; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.util.TestUtil; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +@Testcontainers +class SqlLoadDomainLinksTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/edge-crawler-cache.sql") + .withNetworkAliases("mariadb"); + + HikariDataSource dataSource; + LoaderData loaderData; + @BeforeEach + public void setUp() { + dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); + + var loadDomains = new SqlLoadDomains(dataSource); + loaderData = new LoaderData(10); + + loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); + loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") }); + } + + @AfterEach + public void tearDown() { + dataSource.close(); + } + + @Test + public void loadDomainLinks() { + var loader = new SqlLoadDomainLinks(dataSource); + loader.load(new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) }); + } + +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java new file mode 100644 index 00000000..25dd18b4 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java @@ -0,0 +1,52 @@ +package nu.marginalia.wmsa.edge.converting.loader; + +import nu.marginalia.util.TestUtil; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import static org.junit.jupiter.api.Assertions.*; + +@Testcontainers +class SqlLoadDomainsTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/edge-crawler-cache.sql") + .withNetworkAliases("mariadb"); + + @Test + public void loadDomain() { + + try (var dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());) { + var loadDomains = new SqlLoadDomains(dataSource); + var loaderData = new LoaderData(10); + + loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); + loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu")); + + assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0); + } + + } + + @Test + public void loadDomains() { + + try (var dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());) { + var loadDomains = new SqlLoadDomains(dataSource); + var loaderData = new LoaderData(10); + + loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); + loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") }); + + assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0); + assertTrue(loaderData.getDomainId(new EdgeDomain("memex.marginalia.nu")) >= 0); + } + + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java new file mode 100644 index 00000000..ecb0e88a --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java @@ -0,0 +1,94 @@ +package nu.marginalia.wmsa.edge.converting.loader; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.util.TestUtil; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; +import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; +import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeId; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.net.URISyntaxException; +import java.util.List; +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +@Testcontainers +class SqlLoadProcessedDocumentTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/edge-crawler-cache.sql") + .withNetworkAliases("mariadb"); + + HikariDataSource dataSource; + LoaderData loaderData; + EdgeDataStoreDaoImpl dataStoreDao; + + @BeforeEach + public void setUp() throws URISyntaxException { + dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); + dataStoreDao = new EdgeDataStoreDaoImpl(dataSource); + + var loadDomains = new SqlLoadDomains(dataSource); + var loadUrls = new SqlLoadUrls(dataSource); + + loaderData = new LoaderData(10); + + loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); + loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu")); + + loadUrls.load(loaderData, new EdgeUrl[]{new EdgeUrl("https://www.marginalia.nu/")}); + + } + + @AfterEach + public void tearDown() { + dataStoreDao.clearCaches(); + dataSource.close(); + } + + @Test + public void loadProcessedDocument() throws URISyntaxException { + var loader = new SqlLoadProcessedDocument(dataSource); + var url = new EdgeUrl("https://www.marginalia.nu/"); + + loader.load(loaderData, List.of(new LoadProcessedDocument( + url, + EdgeUrlState.OK, + "TITLE", + "DESCR", + HtmlFeature.encode(Set.of(HtmlFeature.AFFILIATE_LINK)), + EdgeHtmlStandard.HTML5, + 100, + 12345, + -3.14 + ))); + + var details = dataStoreDao.getUrlDetailsMulti(List.of(new EdgeId<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/"))))); + assertEquals(1, details.size()); + + var urlDetails = details.get(0); + + assertEquals("TITLE", urlDetails.getTitle()); + assertEquals("DESCR", urlDetails.getDescription()); + assertTrue(urlDetails.isAffiliate()); + assertEquals(100, urlDetails.words); + assertEquals(12345, urlDetails.dataHash); + assertEquals(-3.14, urlDetails.getUrlQuality()); + } + +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java new file mode 100644 index 00000000..eb66da92 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java @@ -0,0 +1,54 @@ +package nu.marginalia.wmsa.edge.converting.loader; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.util.TestUtil; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +@Testcontainers +class SqlLoadProcessedDomainTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/edge-crawler-cache.sql") + .withNetworkAliases("mariadb"); + + HikariDataSource dataSource; + LoaderData loaderData; + @BeforeEach + public void setUp() { + + dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); + + var loadDomains = new SqlLoadDomains(dataSource); + loaderData = new LoaderData(10); + + loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); + loadDomains.load(loaderData, new EdgeDomain[]{ new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") }); + } + + @AfterEach + public void tearDown() { + dataSource.close(); + } + + @Test + public void loadProcessedDomain() { + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); + loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), EdgeDomainIndexingState.BLOCKED, "127.0.0.1"); + } + @Test + public void loadDomainAlias() { + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); + loader.loadAlias(loaderData, new DomainLink(new EdgeDomain("memex.marginalia.nu"), new EdgeDomain("www.marginalia.nu"))); + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java new file mode 100644 index 00000000..5afac733 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java @@ -0,0 +1,50 @@ +package nu.marginalia.wmsa.edge.converting.loader; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.util.TestUtil; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.net.URISyntaxException; + +@Testcontainers +class SqlLoadUrlsTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/edge-crawler-cache.sql") + .withNetworkAliases("mariadb"); + + HikariDataSource dataSource; + LoaderData loaderData; + @BeforeEach + public void setUp() { + dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); + + var loadDomains = new SqlLoadDomains(dataSource); + loaderData = new LoaderData(10); + + loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); + loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu")); + } + + @AfterEach + public void tearDown() { + dataSource.close(); + } + + @Test + public void loadUrl() throws URISyntaxException { + var loadUrls = new SqlLoadUrls(dataSource); + loadUrls.load(loaderData, new EdgeUrl[] { new EdgeUrl("https://www.marginalia.nu/") }); + } + +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java index 180576fc..961d8304 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java @@ -1,11 +1,11 @@ package nu.marginalia.wmsa.edge.index.service; import lombok.SneakyThrows; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java index 6b029da9..2b2da0fd 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java @@ -3,14 +3,14 @@ package nu.marginalia.wmsa.edge.index.service; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.client.exception.RemoteException; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.edge.index.EdgeIndexService; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.EdgeId; @@ -23,7 +23,6 @@ import org.junit.jupiter.api.parallel.ResourceAccessMode; import org.junit.jupiter.api.parallel.ResourceLock; import spark.Spark; -import java.io.File; import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; @@ -31,7 +30,6 @@ import java.util.List; import java.util.stream.Collectors; import static nu.marginalia.util.TestUtil.getConnection; -import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java deleted file mode 100644 index f42f2d36..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java +++ /dev/null @@ -1,89 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service; - -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; - -class SearchIndexConverterTest { - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - @Test @Disabled @SneakyThrows - public void test() { - // File dictFile = new File("/home/vlofgren/dictionary.dat"); - File inFile = new File("/home/vlofgren/Work/converter/3/page-index.dat"); - - new SearchIndexConverter(IndexBlock.Title, 0, Path.of("/tmp"), inFile, - new File("/home/vlofgren/Work/converter/words.dat"), - new File("/home/vlofgren/Work/converter/urls.dat"), new SearchIndexPartitioner(null), val -> false); - - // sanityCheck(); - } - - @Test @Disabled - public void sanityCheck() { - File inFile = new File("/home/vlofgren/write/6/page-index.dat"); - -// SearchIndexReader sir = new SearchIndexReader(new SearchIndex[]{ -// new SearchIndex("body", Path.of("/tmp"), -// new File("/home/vlofgren/data/urls.dat"), -// new File("/home/vlofgren/data/words.dat")), -// new SearchIndex("body", Path.of("/tmp"), -// new File("/home/vlofgren/data/urls.dat"), -// new File("/home/vlofgren/data/words.dat")) -// , -// new SearchIndex("body", Path.of("/tmp"), -// new File("/home/vlofgren/data/urls.dat"), -// new File("/home/vlofgren/data/words.dat")) -// , -// new SearchIndex("body", Path.of("/tmp"), -// new File("/home/vlofgren/data/urls.dat"), -// new File("/home/vlofgren/data/words.dat")) -// }); - -// getQuery(sir, new EdgeIndexSearchTerms(List.of(152, 106), Collections.emptyList())).stream().forEach(System.out::println); -// sir.findWord(152).also(106).stream().forEach(System.out::println); -// scanFile(inFile, (url, word) -> { -// //System.out.println(url + " " + word); -// if (!sir.findWord(word).stream().anyMatch(url::equals)) { -// logger.error("Can't find word {} in {}", word, url); -// } -// }); - - - } -/* - private SearchIndexReader.Query getQuery(SearchIndexReader indexReader, EdgeIndexSearchTerms searchTerms) { - var orderedIncludes = searchTerms.includes - .stream() - .sorted(Comparator.comparingLong(indexReader::numHits)) - .distinct() - .mapToInt(Integer::intValue) - .toArray(); - - logger.info("Includes: ({}); excludes: ({})", Arrays. - stream(orderedIncludes) - .mapToObj(String::valueOf) - .collect(Collectors.joining(",")), - searchTerms.excludes.stream().map(String::valueOf).collect(Collectors.joining(","))); - SearchIndexReader.Query query = indexReader.findWord(orderedIncludes[0]); - for (int i = 1; i < orderedIncludes.length; i++) { - query = query.also(orderedIncludes[i]); - } - for (int term : searchTerms.excludes) { - query = query.not(term); - } - return query; - } - -*/ -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java index 5f1d2a0f..edcfa71f 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java @@ -1,14 +1,14 @@ package nu.marginalia.wmsa.edge.index.service; import lombok.SneakyThrows; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndex; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; -import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; +import nu.marginalia.wmsa.edge.index.reader.SearchIndex; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; +import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; import nu.marginalia.wmsa.edge.model.EdgeId; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java index ee84472e..e780ed62 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java @@ -1,6 +1,6 @@ package nu.marginalia.wmsa.edge.index.service; -import nu.marginalia.wmsa.edge.index.service.dictionary.TokenCompressor; +import nu.marginalia.wmsa.edge.index.dictionary.TokenCompressor; import org.junit.jupiter.api.Test; import java.util.Arrays; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java index 4aa9bceb..65b1ad57 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java @@ -13,6 +13,7 @@ class QueryVariantsTest { QueryVariants variants; QueryParser parser; SentenceExtractor se; + @BeforeEach public void setUp() { LanguageModels lm = TestLanguageModels.getLanguageModels(); @@ -24,7 +25,7 @@ class QueryVariantsTest { parser = new QueryParser(new EnglishDictionary(dict), variants); } - @Test + @Test @SuppressWarnings("unchecked") void getQueryVariants() { System.out.println(se.extractSentence("we are alone")); testCase("DOS", List.of("DOS")); @@ -50,7 +51,5 @@ class QueryVariantsTest { private void testCase(String input, List... expected) { var tokens = variants.getQueryVariants(parser.extractBasicTokens(input)); System.out.println(tokens); -// var result = tokens.stream().map(lst -> lst.terms).collect(Collectors.toSet()); -// assertEquals(Set.of(expected), result, "Case failed: " + input); } } \ No newline at end of file From f76af4ca79e8d8b84deb2883824d69a568ba62b6 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sat, 18 Jun 2022 15:54:58 +0200 Subject: [PATCH 03/40] Refactoring conversion --- .../java/nu/marginalia/util/ListChunker.java | 31 ++ .../nu/marginalia/util/RandomWriteFunnel.java | 50 +-- .../util/multimap/MultimapFileLong.java | 53 ++- .../multimap/MultimapFileLongOffsetSlice.java | 3 + .../util/multimap/MultimapFileLongSlice.java | 2 + .../wmsa/client/AbstractClient.java | 8 +- .../loader/SqlLoadProcessedDocument.java | 2 - .../CrawlJobExtractorPageRankMain.java | 6 +- .../edge/data/dao/EdgeDataStoreDaoImpl.java | 15 +- .../data/dao/task/EdgeDomainBlacklist.java | 2 +- .../wmsa/edge/index/EdgeIndexBucket.java | 6 +- .../wmsa/edge/index/EdgeIndexControl.java | 6 +- .../wmsa/edge/index/EdgeIndexService.java | 59 ++- .../wmsa/edge/index/IndexServicesFactory.java | 27 +- .../conversion/SearchIndexConverter.java | 367 ++++++------------ .../conversion/SearchIndexPartitioner.java | 2 +- .../conversion/SearchIndexPreconverter.java | 65 +--- .../words/WordIndexOffsetsTable.java | 6 +- .../journal/SearchIndexJournalEntry.java | 49 +++ .../SearchIndexJournalEntryHeader.java | 16 + .../journal/SearchIndexJournalFileHeader.java | 4 + .../journal/SearchIndexJournalReader.java | 123 ++++++ .../journal/SearchIndexJournalWriter.java | 10 + ...java => SearchIndexJournalWriterImpl.java} | 36 +- .../edge/index/journal/SearchIndexWriter.java | 16 - .../wmsa/edge/index/reader/SearchIndexes.java | 8 +- .../nu/marginalia/wmsa/edge/model/EdgeId.java | 11 +- .../model/search/EdgeSearchResultItem.java | 2 +- .../wmsa/edge/search/EdgeSearchOperator.java | 2 +- .../command/commands/SiteSearchCommand.java | 4 +- .../search/results/SearchResultDecorator.java | 4 +- .../siteinfo/DomainInformationService.java | 21 +- .../index/service/EdgeIndexClientTest.java | 5 +- .../service/SearchIndexJournalWriterTest.java | 76 ++++ .../index/service/SearchIndexWriterTest.java | 90 ----- .../service/util/RandomWriteFunnelTest.java | 33 ++ .../com/upserve/uppend/blobs/NativeIO.java | 3 - 37 files changed, 658 insertions(+), 565 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntry.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntryHeader.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalFileHeader.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/{SearchIndexWriterImpl.java => SearchIndexJournalWriterImpl.java} (68%) delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java b/marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java new file mode 100644 index 00000000..ef27ba1d --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java @@ -0,0 +1,31 @@ +package nu.marginalia.util; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class ListChunker { + + /** Chops data into a list of lists of max length size + * + * Caveat: Relies on subList and does not clone "data", so + * changes to the original list may affect the sub-lists + * in unspecified ways + * + * @see List#subList + */ + public static List> chopList(List data, int size) { + if (data.isEmpty()) + return Collections.emptyList(); + else if (data.size() < size) + return List.of(data); + + final List> ret = new ArrayList<>(1 + data.size() / size); + + for (int i = 0; i < data.size(); i+=size) { + ret.add(data.subList(i, Math.min(data.size(), i+size))); + } + + return ret; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java b/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java index 55c83464..0c274c2b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java @@ -1,6 +1,6 @@ package nu.marginalia.util; -import io.prometheus.client.Gauge; +import lombok.SneakyThrows; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -18,10 +18,6 @@ import java.nio.file.Path; * */ public class RandomWriteFunnel implements AutoCloseable { - private final static Gauge write_rate = Gauge.build("wmsa_rwf_write_bytes", "Bytes/s") - .register(); - private final static Gauge transfer_rate = Gauge.build("wmsa_rwf_transfer_bytes", "Bytes/s") - .register(); private static final Logger logger = LoggerFactory.getLogger(RandomWriteFunnel.class); private final DataBin[] bins; @@ -34,7 +30,7 @@ public class RandomWriteFunnel implements AutoCloseable { int binCount = (int) (size / binSize + ((size % binSize) != 0L ? 1 : 0)); bins = new DataBin[binCount]; for (int i = 0; i < binCount; i++) { - bins[i] = new DataBin(tempDir, (int) Math.min(size - binSize * i, binSize)); + bins[i] = new DataBin(tempDir, Math.min((int) (size - binSize * i), binSize)); } } else { @@ -42,25 +38,25 @@ public class RandomWriteFunnel implements AutoCloseable { } } - public void put(long address, long data) throws IOException { - bins[((int)(address / binSize))].put((int)(address%binSize), data); + @SneakyThrows + public void put(long address, long data) { + int bin = (int)(address / binSize); + int offset = (int)(address%binSize); + + bins[bin].put(offset, data); } public void write(FileChannel o) throws IOException { ByteBuffer buffer = ByteBuffer.allocateDirect(binSize*8); - logger.debug("Writing from RWF"); - for (int i = 0; i < bins.length; i++) { - var bin = bins[i]; + for (var bin : bins) { buffer.clear(); bin.eval(buffer); while (buffer.hasRemaining()) { - int wb = o.write(buffer); - write_rate.set(wb); + o.write(buffer); } } - logger.debug("Done"); } @Override @@ -84,12 +80,12 @@ public class RandomWriteFunnel implements AutoCloseable { } void put(int address, long data) throws IOException { - buffer.putInt(address); - buffer.putLong(data); - - if (buffer.capacity() - buffer.position() < 12) { + if (buffer.remaining() < 12) { flushBuffer(); } + + buffer.putInt(address); + buffer.putLong(data); } private void flushBuffer() throws IOException { @@ -97,12 +93,15 @@ public class RandomWriteFunnel implements AutoCloseable { return; buffer.flip(); - while (channel.write(buffer) > 0); + while (buffer.hasRemaining()) + channel.write(buffer); + buffer.clear(); } private void eval(ByteBuffer dest) throws IOException { flushBuffer(); + channel.force(false); channel.position(0); buffer.clear(); @@ -117,14 +116,17 @@ public class RandomWriteFunnel implements AutoCloseable { if (rb < 0) { break; } - else { - transfer_rate.set(rb); - } buffer.flip(); while (buffer.limit() - buffer.position() >= 12) { - int addr = buffer.getInt(); + int addr = 8 * buffer.getInt(); long data = buffer.getLong(); - dest.putLong(8*addr, data); + + try { + dest.putLong(addr, data); + } + catch (IndexOutOfBoundsException ex) { + logger.info("!!!bad[{}]={}", addr, data); + } } buffer.compact(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java index f381a977..e9a9b4fe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java @@ -36,9 +36,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { private long mappedSize; final static long WORD_SIZE = 8; - private boolean loadAggressively; - - private final NativeIO.Advice advice = null; + private NativeIO.Advice defaultAdvice = null; public static MultimapFileLong forReading(Path file) throws IOException { long fileSize = Files.size(file); @@ -70,12 +68,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { long mapSize, int bufferSize) throws IOException { - this(new RandomAccessFile(file, translateToRAFMode(mode)), mode, mapSize, bufferSize, false); - } - - public MultimapFileLong loadAggressively(boolean v) { - this.loadAggressively = v; - return this; + this(new RandomAccessFile(file, translateToRAFMode(mode)), mode, mapSize, bufferSize); } private static String translateToRAFMode(FileChannel.MapMode mode) { @@ -91,13 +84,11 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { public MultimapFileLong(RandomAccessFile file, FileChannel.MapMode mode, long mapSizeBytes, - int bufferSizeWords, - boolean loadAggressively) throws IOException { + int bufferSizeWords) throws IOException { this.mode = mode; this.bufferSize = bufferSizeWords; this.mapSize = mapSizeBytes; this.fileLength = file.length(); - this.loadAggressively = loadAggressively; channel = file.getChannel(); mappedSize = 0; @@ -115,6 +106,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { @SneakyThrows public void advice(NativeIO.Advice advice) { + this.defaultAdvice = advice; for (var buffer : mappedByteBuffers) { NativeIO.madvise(buffer, advice); } @@ -157,7 +149,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { } @SneakyThrows - private void grow(long posIdxRequired) { + public void grow(long posIdxRequired) { if (posIdxRequired*WORD_SIZE > mapSize && mode == READ_ONLY) { throw new IndexOutOfBoundsException(posIdxRequired + " (max " + mapSize + ")"); } @@ -182,11 +174,8 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { var buffer = channel.map(mode, posBytes, bzBytes); - if (loadAggressively) - buffer.load(); - - if (advice != null) { - NativeIO.madvise(buffer, advice); + if (defaultAdvice != null) { + NativeIO.madvise(buffer, defaultAdvice); } buffers.add(buffer.asLongBuffer()); @@ -262,6 +251,32 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { } + @Override + public void read(LongBuffer vals, long idx) { + int n = vals.limit() - vals.position(); + if (idx+n >= mappedSize) { + grow(idx+n); + } + int iN = (int)((idx + n) / bufferSize); + + for (int i = 0; i < n; ) { + int i0 = (int)((idx + i) / bufferSize); + + int bufferOffset = (int) ((idx+i) % bufferSize); + var buffer = buffers.get(i0); + + final int l; + + if (i0 < iN) l = bufferSize - bufferOffset; + else l = Math.min(n - i, bufferSize - bufferOffset); + + vals.put(vals.position() + i, buffer, bufferOffset, l); + i+=l; + } + + } + + @Override public void write(long[] vals, long idx) { write(vals, vals.length, idx); @@ -363,8 +378,10 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { @Override public void close() throws IOException { force(); + mappedByteBuffers.clear(); buffers.clear(); + channel.close(); // I want to believe diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java index c2630ddc..bd35bd9b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java @@ -38,6 +38,9 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice { map.read(vals, n, idx+off); } + @Override + public void read(LongBuffer vals, long idx) { map.read(vals, idx+off); } + @Override public void write(long[] vals, long idx) { map.write(vals, idx+off); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java index abf29f51..27d6ae06 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java @@ -15,6 +15,8 @@ public interface MultimapFileLongSlice { void read(long[] vals, int n, long idx); + void read(LongBuffer vals, long idx); + void write(long[] vals, long idx); void write(long[] vals, int n, long idx); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java index 5091b75e..603f57e5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java @@ -6,6 +6,7 @@ import io.reactivex.rxjava3.core.Observable; import io.reactivex.rxjava3.core.ObservableSource; import io.reactivex.rxjava3.plugins.RxJavaPlugins; import lombok.SneakyThrows; +import marcono1234.gson.recordadapter.RecordTypeAdapterFactory; import nu.marginalia.wmsa.client.exception.LocalException; import nu.marginalia.wmsa.client.exception.NetworkException; import nu.marginalia.wmsa.client.exception.RemoteException; @@ -30,9 +31,12 @@ import java.util.zip.GZIPOutputStream; public abstract class AbstractClient implements AutoCloseable { public static final String CONTEXT_OUTBOUND_REQUEST = "outbound-request"; - private final Gson gson = new GsonBuilder().create(); + + private final Gson gson = new GsonBuilder() + .registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create()) + .create(); + private final Logger logger = LoggerFactory.getLogger(getClass()); - private final Marker httpMarker = MarkerFactory.getMarker("HTTP"); private final OkHttpClient client; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java index e2e25fff..fb8a6303 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java @@ -94,8 +94,6 @@ public class SqlLoadProcessedDocument { } catch (SQLException ex) { logger.warn("SQL error inserting document", ex); } - - } public void loadWithError(LoaderData data, List documents) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java index ea1946fc..ef3bf39f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java @@ -105,7 +105,7 @@ public class CrawlJobExtractorPageRankMain { try (var domainQuery = conn.prepareStatement(specificDomainSqlFromId); var urlQuery = conn.prepareStatement(urlsSql)) { - domainQuery.setInt(1, domainId.getId()); + domainQuery.setInt(1, domainId.id()); ResultSet rsp = domainQuery.executeQuery(); domainName = rsp.next() ? rsp.getString(1) : ""; @@ -113,10 +113,10 @@ public class CrawlJobExtractorPageRankMain { spec.id = createId(new EdgeDomain(domainName)); spec.urls = new ArrayList<>(1000); - spec.crawlDepth = getCrawlDepth(new DomainWithId(domainName, domainId.getId())); + spec.crawlDepth = getCrawlDepth(new DomainWithId(domainName, domainId.id())); urlQuery.setString(1, domainName.toString()); - urlQuery.setInt(2, domainId.getId()); + urlQuery.setInt(2, domainId.id()); urlQuery.setFetchSize(1000); rsp = urlQuery.executeQuery(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java index 30ea2256..c73089b0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java @@ -17,13 +17,8 @@ import nu.marginalia.wmsa.edge.search.model.BrowseResult; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.sql.Connection; import java.sql.SQLException; -import java.sql.Types; import java.util.*; -import java.util.function.Function; -import java.util.regex.Pattern; -import java.util.stream.Collectors; public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { @@ -71,7 +66,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { private String idList(List> ids) { StringJoiner j = new StringJoiner(",", "(", ")"); for (var id : ids) { - j.add(Integer.toString(id.getId())); + j.add(Integer.toString(id.id())); } return j.toString(); } @@ -154,7 +149,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement(q)) { stmt.setFetchSize(count); - stmt.setInt(1, domainId.getId()); + stmt.setInt(1, domainId.id()); stmt.setInt(2, count); var rsp = stmt.executeQuery(); while (rsp.next()) { @@ -183,7 +178,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { try (var stmt = connection.prepareStatement(q2)) { stmt.setFetchSize(count/2); - stmt.setInt(1, domainId.getId()); + stmt.setInt(1, domainId.id()); stmt.setInt(2, count/2 - domains.size()); var rsp = stmt.executeQuery(); while (rsp.next() && domains.size() < count/2) { @@ -214,7 +209,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { LIMIT ?"""; try (var stmt = connection.prepareStatement(q3)) { stmt.setFetchSize(count/2); - stmt.setInt(1, domainId.getId()); + stmt.setInt(1, domainId.id()); stmt.setInt(2, count/2 - domains.size()); var rsp = stmt.executeQuery(); @@ -275,7 +270,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, id.getId()); + stmt.setInt(1, id.id()); var rsp = stmt.executeQuery(); if (rsp.next()) { return new EdgeDomain(rsp.getString(1)); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklist.java index fa1899b1..df265a5f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklist.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklist.java @@ -9,7 +9,7 @@ import nu.marginalia.wmsa.edge.model.EdgeId; public interface EdgeDomainBlacklist { boolean isBlacklisted(int domainId); default boolean isBlacklisted(EdgeId domainId) { - return isBlacklisted(domainId.getId()); + return isBlacklisted(domainId.id()); } default TIntHashSet getSpamDomains() { return new TIntHashSet(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java index 05bcfe75..09890252 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java @@ -1,11 +1,9 @@ package nu.marginalia.wmsa.edge.index; -import nu.marginalia.wmsa.edge.index.EdgeIndexControl; -import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriter; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter; import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; import nu.marginalia.wmsa.edge.index.reader.query.Query; import org.jetbrains.annotations.NotNull; @@ -31,7 +29,7 @@ public class EdgeIndexBucket { @NotNull private final IndexServicesFactory servicesFactory; private final EdgeIndexControl indexControl; - private final SearchIndexWriter writer; + private final SearchIndexJournalWriter writer; private final int id; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java index ab7c73fe..8df32c0a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java @@ -23,7 +23,7 @@ public class EdgeIndexControl { for (IndexBlock block : IndexBlock.values()) { try { - servicesFactory.getIndexConverter(id, block); + servicesFactory.convertIndex(id, block); System.runFinalization(); System.gc(); @@ -40,10 +40,6 @@ public class EdgeIndexControl { System.gc(); } - public long wordCount(int id) { - return servicesFactory.wordCount(id); - } - public void switchIndexFiles(int id) throws Exception { servicesFactory.switchFilesJob(id).call(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index de6276a8..829a59af 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -11,12 +11,17 @@ import gnu.trove.set.hash.TIntHashSet; import io.prometheus.client.Counter; import io.prometheus.client.Histogram; import io.reactivex.rxjava3.schedulers.Schedulers; +import marcono1234.gson.recordadapter.RecordTypeAdapterFactory; +import nu.marginalia.util.ListChunker; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.Service; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntryHeader; import nu.marginalia.wmsa.edge.index.model.*; import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.wmsa.edge.model.*; @@ -48,8 +53,11 @@ public class EdgeIndexService extends Service { @NotNull private final Initialization init; private final SearchIndexes indexes; + private final DictionaryWriter dictionaryWriter; - private final Gson gson = new GsonBuilder().create(); + private final Gson gson = new GsonBuilder() + .registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create()) + .create(); private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").help("-").register(); @@ -66,12 +74,13 @@ public class EdgeIndexService extends Service { @Named("service-port") Integer port, Initialization init, MetricsServer metricsServer, - SearchIndexes indexes - ) { + SearchIndexes indexes, + IndexServicesFactory servicesFactory) { super(ip, port, init, metricsServer); this.init = init; this.indexes = indexes; + this.dictionaryWriter = servicesFactory.getDictionaryWriter(); Spark.post("/words/", this::putWords); Spark.post("/search/", this::search, gson::toJson); @@ -173,29 +182,19 @@ public class EdgeIndexService extends Service { public void putWords(EdgeId domainId, EdgeId urlId, EdgePageWords words, int idx ) { - SearchIndexWriterImpl indexWriter = indexes.getIndexWriter(idx); + SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx); - if (!words.words.isEmpty()) { - if (words.size() < 1000) { - indexWriter.put(domainId, urlId, words.block, words.words); - } else { - chunks(words.words, 1000).forEach(chunk -> { - indexWriter.put(domainId, urlId, words.block, chunk); - }); - } - } + for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) { + + var entry = new SearchIndexJournalEntry(getWordIds(chunk)); + var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block); + + indexWriter.put(header, entry); + }; } - - private List> chunks(Collection coll, int size) { - List> ret = new ArrayList<>(); - List data = List.copyOf(coll); - - for (int i = 0; i < data.size(); i+=size) { - ret.add(data.subList(i, Math.min(data.size(), i+size))); - } - - return ret; + private long[] getWordIds(List words) { + return words.stream().filter(w -> w.length() < Byte.MAX_VALUE).mapToLong(dictionaryWriter::get).toArray(); } private Object search(Request request, Response response) { @@ -341,7 +340,7 @@ public class EdgeIndexService extends Service { getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms) .mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id)) - .filter(ri -> !seenResults.contains(ri.url.getId()) && localFilter.test(i, domainCountFilter, ri)) + .filter(ri -> !seenResults.contains(ri.url.id()) && localFilter.test(i, domainCountFilter, ri)) .limit(specs.limitTotal * 3L) .distinct() .limit(Math.min(specs.limitByBucket @@ -350,7 +349,7 @@ public class EdgeIndexService extends Service { for (var result : resultsForBucket) { - seenResults.add(result.url.getId()); + seenResults.add(result.url.id()); } for (var result : resultsForBucket) { for (var searchTerm : sq.searchTermsInclude) { @@ -401,7 +400,7 @@ public class EdgeIndexService extends Service { public boolean filterRawValue(int bucket, long value) { var domain = new EdgeId((int)(value >>> 32)); - if (domain.getId() == Integer.MAX_VALUE) { + if (domain.id() == Integer.MAX_VALUE) { return true; } @@ -409,11 +408,11 @@ public class EdgeIndexService extends Service { } long getKey(int bucket, EdgeId id) { - return ((long)bucket) << 32 | id.getId(); + return ((long)bucket) << 32 | id.id(); } public boolean test(int bucket, EdgeSearchResultItem item) { - if (item.domain.getId() == Integer.MAX_VALUE) { + if (item.domain.id() == Integer.MAX_VALUE) { return true; } @@ -431,7 +430,7 @@ public class EdgeIndexService extends Service { } public boolean test(int bucket, DomainResultCountFilter root, EdgeSearchResultItem item) { - if (item.domain.getId() == Integer.MAX_VALUE) { + if (item.domain.id() == Integer.MAX_VALUE) { return true; } return root.getCount(bucket, item) + resultsByDomain.adjustOrPutValue(getKey(bucket, item.domain), 1, 1) <= limitByDomain; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java index 61e64b41..40c733e2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java @@ -8,7 +8,7 @@ import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPreconverter; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; @@ -44,7 +44,8 @@ public class IndexServicesFactory { private final DoublePartitionedDataFile indexWriteUrlsFile; private volatile static DictionaryWriter dictionaryWriter; private final Long dictionaryHashMapSize; - private final SearchIndexPartitioner partitoner; + private final SearchIndexPartitioner partitioner; + @Inject public IndexServicesFactory( @Named("tmp-file-dir") Path tmpFileDir, @@ -59,7 +60,7 @@ public class IndexServicesFactory { @Named("edge-index-write-urls-file") String indexWriteUrlsFile, @Named("edge-dictionary-hash-map-size") Long dictionaryHashMapSize, EdgeDomainBlacklist domainBlacklist, - SearchIndexPartitioner partitoner + SearchIndexPartitioner partitioner ) { this.tmpFileDir = tmpFileDir; @@ -73,11 +74,11 @@ public class IndexServicesFactory { this.indexWriteWordsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteWordsFile); this.indexWriteUrlsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteUrlsFile); this.preconverterOutputFile = new PartitionedDataFile(partitionRootSlowTmp, "preconverted.dat"); - this.partitoner = partitoner; + this.partitioner = partitioner; } - public SearchIndexWriterImpl getIndexWriter(int idx) { - return new SearchIndexWriterImpl(getDictionaryWriter(), writerIndexFile.get(idx)); + public SearchIndexJournalWriterImpl getIndexWriter(int idx) { + return new SearchIndexJournalWriterImpl(getDictionaryWriter(), writerIndexFile.get(idx)); } public DictionaryWriter getDictionaryWriter() { @@ -93,15 +94,17 @@ public class IndexServicesFactory { } - public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException { - return new SearchIndexConverter(block, id, tmpFileDir, + public void convertIndex(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException { + var converter = new SearchIndexConverter(block, id, tmpFileDir, preconverterOutputFile.get(id), indexWriteWordsFile.get(id, block.id), indexWriteUrlsFile.get(id, block.id), - partitoner, + partitioner, domainBlacklist ); + converter.convert(); } + @SneakyThrows public SearchIndexPreconverter getIndexPreconverter() { File[] outputFiles = new File[DYNAMIC_BUCKET_LENGTH+1]; @@ -110,7 +113,7 @@ public class IndexServicesFactory { } return new SearchIndexPreconverter(writerIndexFile.get(0), outputFiles, - partitoner, + partitioner, domainBlacklist ); } @@ -119,10 +122,6 @@ public class IndexServicesFactory { return preconverterOutputFile.get(i); } - public long wordCount(int id) { - return SearchIndexConverter.wordCount(writerIndexFile.get(0)); - } - @SneakyThrows public SearchIndexReader getIndexReader(int id) { EnumMap indexMap = new EnumMap<>(IndexBlock.class); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java index 0827b4e7..2d12d0f4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java @@ -1,331 +1,222 @@ package nu.marginalia.wmsa.edge.index.conversion; -import com.google.inject.Inject; -import com.google.inject.name.Named; -import gnu.trove.set.hash.TIntHashSet; -import lombok.RequiredArgsConstructor; -import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter; import nu.marginalia.util.btree.BTreeWriter; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.RandomWriteFunnel; -import nu.marginalia.util.multimap.MultimapSorter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; -import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.nio.file.Files; import java.nio.file.Path; -import java.util.concurrent.locks.Lock; + +import static nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry.MAX_LENGTH; public class SearchIndexConverter { - private static final long FILE_HEADER_SIZE = 12; - private static final int CHUNK_HEADER_SIZE = 16; - public static final BTreeContext urlsBTreeContext = new BTreeContext(5, 1, ~0, 8); - private final long fileLength; - private final long urlsFileSize; + private final long[] tmpWordsBuffer = new long[MAX_LENGTH]; + private final Path tmpFileDir; - private final FileChannel urlsTmpFileChannel; - private final int wordCount; - private final MultimapFileLong urlsTmpFileMap; private final Logger logger = LoggerFactory.getLogger(getClass()); private final IndexBlock block; private final int bucketId; + private final File inputFile; + private final File outputFileWords; + private final File outputFileUrls; - private final File urlsFile; private final SearchIndexPartitioner partitioner; - private final TIntHashSet spamDomains; - private final MultimapSorter urlTmpFileSorter; + private final EdgeDomainBlacklist blacklist; private final static int internalSortLimit = Boolean.getBoolean("small-ram") ? 1024*1024 : 1024*1024*256; - @SneakyThrows - public static long wordCount(File inputFile) { - try (RandomAccessFile raf = new RandomAccessFile(inputFile, "r")) { - raf.readLong(); - return raf.readInt(); - } - } - - @Inject public SearchIndexConverter(IndexBlock block, - int bucketId, @Named("tmp-file-dir") Path tmpFileDir, - @Named("edge-writer-page-index-file") File inputFile, - @Named("edge-index-write-words-file") File outputFileWords, - @Named("edge-index-write-urls-file") File outputFileUrls, + int bucketId, + Path tmpFileDir, + File inputFile, + File outputFileWords, + File outputFileUrls, SearchIndexPartitioner partitioner, EdgeDomainBlacklist blacklist) - throws ConversionUnnecessaryException, IOException { this.block = block; this.bucketId = bucketId; this.tmpFileDir = tmpFileDir; - this.urlsFile = outputFileUrls; + this.inputFile = inputFile; + this.outputFileWords = outputFileWords; + this.outputFileUrls = outputFileUrls; this.partitioner = partitioner; - this.spamDomains = blacklist.getSpamDomains(); - - logger.info("Converting {} ({}) {}", block.id, block, inputFile); + this.blacklist = blacklist; + } + public void convert() throws IOException { Files.deleteIfExists(outputFileWords.toPath()); Files.deleteIfExists(outputFileUrls.toPath()); - final RandomAccessFile raf = new RandomAccessFile(inputFile, "r"); + SearchIndexJournalReader journalReader = new SearchIndexJournalReader(MultimapFileLong.forReading(inputFile.toPath())); - this.fileLength = raf.readLong(); - this.wordCount = raf.readInt(); - - if (fileLength <= FILE_HEADER_SIZE) { - throw new ConversionUnnecessaryException(); + if (journalReader.fileHeader.fileSize() <= SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES) { + return; } - var inputChannel = raf.getChannel(); + logger.info("Converting {} ({}) {} {}", block.id, block, inputFile, journalReader.fileHeader); - ByteBuffer buffer = ByteBuffer.allocateDirect(10_000); + var lock = partitioner.getReadLock(); + try { + lock.lock(); - urlsFileSize = getUrlsSize(buffer, inputChannel); + var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); - var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); - var urlsTmpFileRaf = new RandomAccessFile(tmpUrlsFile.toFile(), "rw"); - urlsTmpFileChannel = urlsTmpFileRaf.getChannel(); - urlsTmpFileMap = new MultimapFileLong(urlsTmpFileRaf, FileChannel.MapMode.READ_WRITE, urlsFileSize, 8*1024*1024, false); - urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit); + logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block); + WordIndexOffsetsTable wordIndexTable = createWordIndexTable(journalReader, outputFileWords); - logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block); - WordIndexOffsetsTable wordIndexTable = createWordIndexTable(outputFileWords, inputChannel); + logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block); + createUrlTable(journalReader, tmpUrlsFile, wordIndexTable); - logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block); - createUrlTable(buffer, raf, wordIndexTable); - - Files.delete(tmpUrlsFile); - raf.close(); - - urlsTmpFileChannel.close(); - urlsTmpFileMap.force(); - - } - - private boolean isUrlAllowed(long url) { - return !spamDomains.contains((int)(url >>> 32)); - } - - public long translateUrl(long url) { - int domainId = partitioner.translateId(bucketId, (int) (url >>> 32)); - return ((long)domainId << 32) | (url & 0xFFFFFFFFL); - } - - - private long getUrlsSize(ByteBuffer buffer, FileChannel channel) throws IOException { - channel.position(FILE_HEADER_SIZE); - - var reader = new IndexReader(buffer, channel) { - public long size; - - @Override - public void eachWord(long urlId, int wordId) { - size++; - } - }; - - reader.read(); - - logger.info("Blacklist filtered {} URLs", reader.filtered); - logger.debug("URLs Size {} Mb", channel.position()/(1024*1024)); - - return reader.size; - } - - private void createUrlTable(ByteBuffer buffer, RandomAccessFile raf, WordIndexOffsetsTable wordOffsetsTable) throws IOException { - logger.info("Table size = {}", wordOffsetsTable.length()); - - raf.seek(FILE_HEADER_SIZE); - - var channel = raf.getChannel(); - - try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) { - int[] wordWriteOffset = new int[wordOffsetsTable.length()]; - - new IndexReader(buffer, channel) { - @Override - public void eachWord(long urlId, int wordId) throws IOException { - if (wordId >= wordWriteOffset.length) - return; - - if (wordId > 0) { - rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, translateUrl(urlId)); - } else { - rwf.put(wordWriteOffset[wordId]++, translateUrl(urlId)); - } - } - }.read(); - - rwf.write(urlsTmpFileChannel); + Files.delete(tmpUrlsFile); } - - urlsTmpFileChannel.force(false); - logger.info("URL TMP Table: {} Mb", channel.position()/(1024*1024)); - - if (wordOffsetsTable.length() > 0) { - logger.info("Sorting urls table"); - - wordOffsetsTable.forEach(urlTmpFileSorter::sort); - - urlsTmpFileMap.force(); - } - else { - logger.warn("urls table empty -- nothing to sort"); - } - - logger.info("Writing BTree"); - try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) { - var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext); - - wordOffsetsTable.fold((accumulatorIdx, start, length) -> { - // Note: The return value is accumulated into accumulatorIdx! - - return writer.write(accumulatorIdx, length, - slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length)); - }); - - } catch (Exception e) { - logger.error("Error while writing BTree", e); + finally { + lock.unlock(); } } - private WordIndexOffsetsTable createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws IOException { - inputChannel.position(FILE_HEADER_SIZE); - logger.debug("Table size = {}", wordCount); - WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount); - ByteBuffer buffer = ByteBuffer.allocateDirect(8* SearchIndexWriterImpl.MAX_BLOCK_SIZE); + + private WordIndexOffsetsTable createWordIndexTable(SearchIndexJournalReader journalReader, + File outputFileWords) throws IOException + { + final int topWord = (int) journalReader.fileHeader.wordCount(); + + logger.debug("Table size = {}", topWord); + WordsTableWriter wordsTableWriter = new WordsTableWriter(topWord); logger.debug("Reading words"); - var reader = new IndexReader(buffer, inputChannel) { - @Override - public void eachWord(long urlId, int wordId) { + for (var entry : journalReader) { + if (!isRelevantEntry(entry)) { + continue; + } + + final SearchIndexJournalEntry entryData = entry.readEntryUsingBuffer(tmpWordsBuffer); + + for (int i = 0; i < entryData.size(); i++) { + int wordId = (int) entryData.get(i); + if (wordId < 0 || wordId >= topWord) { + logger.warn("Bad wordId {}", wordId); + } wordsTableWriter.acceptWord(wordId); } - }; - reader.read(); + } logger.debug("Rearranging table"); - inputChannel.position(FILE_HEADER_SIZE); - wordsTableWriter.write(outputFileWords); return wordsTableWriter.getTable(); } - @RequiredArgsConstructor - private class IndexReader { - private final ByteBuffer buffer; - private final FileChannel channel; - public long filtered; + private void createUrlTable(SearchIndexJournalReader journalReader, + Path tmpUrlsFile, + WordIndexOffsetsTable wordOffsetsTable) throws IOException + { + logger.info("Table size = {}", wordOffsetsTable.length()); - public void read() throws IOException { - var lock = partitioner.getReadLock(); - try { - lock.lock(); - outer: - while (channel.position() < fileLength) { - buffer.clear(); - buffer.limit(CHUNK_HEADER_SIZE); - channel.read(buffer); - buffer.flip(); - long urlId = buffer.getLong(); - int chunkBlock = buffer.getInt(); - int count = buffer.getInt(); + long numberOfWordsTotal = 0; + for (var entry : journalReader) { + if (isRelevantEntry(entry)) + numberOfWordsTotal += entry.wordCount(); + } - if (count > 1000) { - int tries = 0; - logger.warn("Terminating garbage @{}b, attempting repair", channel.position()); + try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw"); + FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) { - for (; ; ) { - tries++; - long p = channel.position(); - buffer.clear(); - buffer.limit(8); - if (channel.read(buffer) != 8) { - break outer; // EOF...? - } + try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, numberOfWordsTotal, 10_000_000)) { + int[] wordWriteOffset = new int[wordOffsetsTable.length()]; - buffer.flip(); - int pcb = buffer.getInt(); - int pct = buffer.getInt(); - if (pcb == 0 || pcb == 1 && pct >= 0 && pct <= 1000) { - chunkBlock = pcb; - count = pct; - break; - } else { - channel.position(p + 1); - } + for (var entry : journalReader) { + if (!isRelevantEntry(entry)) continue; + + var entryData = entry.readEntryUsingBuffer(tmpWordsBuffer); + + for (int i = 0; i < entryData.size(); i++) { + int wordId = (int) entryData.get(i); + + if (wordId >= wordWriteOffset.length) + continue; + if (wordId < 0) { + logger.warn("Negative wordId {}", wordId); } - logger.warn("Skipped {}b", tries); - } - buffer.clear(); - buffer.limit(count * 4); - - int trb = 0; - while (trb < count * 4) { - int rb = channel.read(buffer); - if (rb <= 0) { - throw new ArrayIndexOutOfBoundsException(trb + " - " + count * 4 + " " + rb); + final long urlInternal = translateUrl(entry.docId()); + if (wordId > 0) { + rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, urlInternal); + } else { + rwf.put(wordWriteOffset[wordId]++, urlInternal); } - trb += rb; - } - - buffer.flip(); - - if (isUrlAllowed(urlId)) { - if (block.id == chunkBlock) { - eachUrl(lock, count, urlId); - } - } else { - filtered++; } } - } - finally { - lock.unlock(); - } - } - public void eachUrl(Lock lock, int count, long urlId) throws IOException { - for (int i = 0; i < count; i++) { - int wordId = buffer.getInt(); - if (acceptWord(lock, urlId)) { - eachWord(urlId, wordId); + + rwf.write(urlsTmpFileChannel); + } + + urlsTmpFileChannel.force(false); + + try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) { + if (wordOffsetsTable.length() > 0) { + logger.info("Sorting urls table"); + + var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit); + + wordOffsetsTable.forEachRange(urlTmpFileSorter::sort); + + urlsTmpFileMap.force(); + } else { + logger.warn("urls table empty -- nothing to sort"); } } - } - public void eachWord(long urlId, int wordId) throws IOException { - } + logger.info("Writing BTree"); + try (var urlsFileMap = MultimapFileLong.forOutput(outputFileUrls.toPath(), numberOfWordsTotal)) { + var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext); - boolean acceptWord(Lock lock, long urlId) { - int domainId = (int) (urlId >>> 32L); + wordOffsetsTable.foldRanges((accumulatorIdx, start, length) -> { + // Note: The return value is accumulated into accumulatorIdx! - if (!partitioner.filterUnsafe(lock, domainId, bucketId)) { - return false; + return writer.write(accumulatorIdx, length, + slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length)); + }); + + } catch (Exception e) { + logger.error("Error while writing BTree", e); } - return true; } } + + + private long translateUrl(long url) { + int domainId = partitioner.translateId(bucketId, (int) (url >>> 32)); + return ((long)domainId << 32) | (url & 0xFFFFFFFFL); + } + + private boolean isRelevantEntry(SearchIndexJournalReader.JournalEntry entry) { + return block.equals(entry.header.block()) + && !blacklist.isBlacklisted(entry.domainId()) + && partitioner.filterUnsafe(entry.domainId(), bucketId); + } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java index bf5a1d74..2f2e9d47 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java @@ -122,7 +122,7 @@ public class SearchIndexPartitioner { public Lock getReadLock() { return rwl.readLock(); } - public boolean filterUnsafe(Lock lock, int domainId, int bucketId) { + public boolean filterUnsafe(int domainId, int bucketId) { return partitionSet.test(domainId, bucketId); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java index 9e851025..5357fc1f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java @@ -3,7 +3,9 @@ package nu.marginalia.wmsa.edge.index.conversion; import com.google.inject.Inject; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; +import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,23 +48,16 @@ public class SearchIndexPreconverter { } } - final RandomAccessFile raf = new RandomAccessFile(inputFile, "r"); + SearchIndexJournalReader indexJournalReader = new SearchIndexJournalReader(MultimapFileLong.forReading(inputFile.toPath())); - var fileLength = raf.readLong(); - var wordCount = raf.readInt(); - final int wordCountOriginal = wordCount; + final long wordCountOriginal = indexJournalReader.fileHeader.wordCount(); - logger.info("Word Count: {}", wordCount); - logger.info("File Length: {}", fileLength); - - var channel = raf.getChannel(); - - ByteBuffer inByteBuffer = ByteBuffer.allocateDirect(10_000); + logger.info("{}", indexJournalReader.fileHeader); RandomAccessFile[] randomAccessFiles = new RandomAccessFile[outputFiles.length]; for (int i = 0; i < randomAccessFiles.length; i++) { randomAccessFiles[i] = new RandomAccessFile(outputFiles[i], "rw"); - randomAccessFiles[i].seek(12); + randomAccessFiles[i].seek(SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES); } FileChannel[] fileChannels = new FileChannel[outputFiles.length]; for (int i = 0; i < fileChannels.length; i++) { @@ -73,33 +68,24 @@ public class SearchIndexPreconverter { var lock = partitioner.getReadLock(); try { lock.lock(); + ByteBuffer buffer = ByteBuffer.allocateDirect(8192); - while (channel.position() < fileLength) { - inByteBuffer.clear(); - inByteBuffer.limit(CHUNK_HEADER_SIZE); - channel.read(inByteBuffer); - inByteBuffer.flip(); - long urlId = inByteBuffer.getLong(); - int chunkBlock = inByteBuffer.getInt(); - int count = inByteBuffer.getInt(); - // inByteBuffer.clear(); - inByteBuffer.limit(count * 4 + CHUNK_HEADER_SIZE); - channel.read(inByteBuffer); - inByteBuffer.position(CHUNK_HEADER_SIZE); - - for (int i = 0; i < count; i++) { - wordCount = Math.max(wordCount, 1 + inByteBuffer.getInt()); + for (var entry : indexJournalReader) { + if (!partitioner.isGoodUrl(entry.urlId()) + || spamDomains.contains(entry.domainId())) { + continue; } - inByteBuffer.position(count * 4 + CHUNK_HEADER_SIZE); + int domainId = entry.domainId(); + buffer.clear(); + entry.copyToBuffer(buffer); + for (int i = 0; i < randomAccessFiles.length; i++) { + if (partitioner.filterUnsafe(domainId, i)) { + buffer.flip(); - if (isUrlAllowed(urlId)) { - for (int i = 0; i < randomAccessFiles.length; i++) { - if (partitioner.filterUnsafe(lock, (int) (urlId >>> 32L), i)) { - inByteBuffer.flip(); - fileChannels[i].write(inByteBuffer); - } + while (buffer.position() < buffer.limit()) + fileChannels[i].write(buffer); } } } @@ -108,27 +94,16 @@ public class SearchIndexPreconverter { lock.unlock(); } - if (wordCountOriginal < wordCount) { - logger.warn("Raised word count {} => {}", wordCountOriginal, wordCount); - } - for (int i = 0; i < randomAccessFiles.length; i++) { long pos = randomAccessFiles[i].getFilePointer(); randomAccessFiles[i].seek(0); randomAccessFiles[i].writeLong(pos); - randomAccessFiles[i].writeInt(wordCount); + randomAccessFiles[i].writeLong(wordCountOriginal); fileChannels[i].force(true); fileChannels[i].close(); randomAccessFiles[i].close(); } } - private boolean isUrlAllowed(long url) { - int urlId = (int)(url & 0xFFFF_FFFFL); - int domainId = (int)(url >>> 32); - - return partitioner.isGoodUrl(urlId) && !spamDomains.contains(domainId); - } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java index 29b88509..f1308d6e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java @@ -16,7 +16,7 @@ public class WordIndexOffsetsTable { return table.length; } - public void forEach(OffsetTableEntryConsumer o) throws IOException { + public void forEachRange(OffsetTableEntryConsumer o) throws IOException { if (table[0] > 0) { o.accept(0, (int) table[0]); } @@ -32,9 +32,9 @@ public class WordIndexOffsetsTable { } /** - * Fold over each span in the file, left to right + * Fold over each span in the file, left to right, accumulating the return value */ - public long fold(OffsetTableEntryFoldConsumer o) throws IOException { + public long foldRanges(OffsetTableEntryFoldConsumer o) throws IOException { long total = 0; if (table[0] > 0) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntry.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntry.java new file mode 100644 index 00000000..493eea40 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntry.java @@ -0,0 +1,49 @@ +package nu.marginalia.wmsa.edge.index.journal; + +import java.nio.ByteBuffer; +import java.util.Arrays; + +public class SearchIndexJournalEntry { + private final int size; + private final long[] underlyingArray; + + public static final int MAX_LENGTH = 1000; + + public SearchIndexJournalEntry(long[] underlyingArray) { + this.size = underlyingArray.length; + this.underlyingArray = underlyingArray; + } + + public SearchIndexJournalEntry(int size, long[] underlyingArray) { + this.size = size; + this.underlyingArray = underlyingArray; + } + + public void write(ByteBuffer buffer) { + for (int i = 0; i < size; i++) { + buffer.putLong(underlyingArray[i]); + } + } + + public long get(int idx) { + if (idx >= size) + throw new ArrayIndexOutOfBoundsException(); + return underlyingArray[idx]; + } + + public int size() { + return size; + } + + public long[] toArray() { + if (size == underlyingArray.length) + return underlyingArray; + else + return Arrays.copyOf(underlyingArray, size); + } + + public String toString() { + return String.format("%s[%s]", getClass().getSimpleName(), Arrays.toString(toArray())); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntryHeader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntryHeader.java new file mode 100644 index 00000000..f635b1d4 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntryHeader.java @@ -0,0 +1,16 @@ +package nu.marginalia.wmsa.edge.index.journal; + +import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeId; +import nu.marginalia.wmsa.edge.model.EdgeUrl; + +public record SearchIndexJournalEntryHeader(int entrySize, long documentId, IndexBlock block) { + + public static final int HEADER_SIZE_LONGS = 2; + + public SearchIndexJournalEntryHeader( EdgeId domainId, EdgeId urlId, IndexBlock block) { + this(-1, (long) domainId.id() << 32 | urlId.id(), block); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalFileHeader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalFileHeader.java new file mode 100644 index 00000000..49ac5009 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalFileHeader.java @@ -0,0 +1,4 @@ +package nu.marginalia.wmsa.edge.index.journal; + +public record SearchIndexJournalFileHeader(long fileSize, long wordCount) { +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java new file mode 100644 index 00000000..0e11646a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java @@ -0,0 +1,123 @@ +package nu.marginalia.wmsa.edge.index.journal; + +import com.upserve.uppend.blobs.NativeIO; +import nu.marginalia.util.multimap.MultimapFileLong; +import nu.marginalia.util.multimap.MultimapFileLongSlice; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import org.jetbrains.annotations.NotNull; + +import java.nio.ByteBuffer; +import java.util.Iterator; + +import static nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS; + +public class SearchIndexJournalReader implements Iterable { + public static final long FILE_HEADER_SIZE_LONGS = 2; + public static final long FILE_HEADER_SIZE_BYTES = 8*FILE_HEADER_SIZE_LONGS; + + public final SearchIndexJournalFileHeader fileHeader; + + private final MultimapFileLongSlice map; + private final long committedSize; + + public SearchIndexJournalReader(MultimapFileLong map) { + fileHeader = new SearchIndexJournalFileHeader(map.get(0), map.get(1)); + committedSize = map.get(0) / 8 - FILE_HEADER_SIZE_LONGS; + + map.advice(NativeIO.Advice.Sequential); + + this.map = map.atOffset(FILE_HEADER_SIZE_LONGS); + } + + @NotNull + @Override + public Iterator iterator() { + return new JournalEntryIterator(); + } + + private class JournalEntryIterator implements Iterator { + private JournalEntry entry; + + @Override + public boolean hasNext() { + if (entry == null) { + return committedSize > 0; + } + + return entry.hasNext(); + } + + @Override + public JournalEntry next() { + if (entry == null) { + entry = new JournalEntry(0); + } + else { + entry = entry.next(); + } + return entry; + } + } + + public class JournalEntry { + private final long offset; + public final SearchIndexJournalEntryHeader header; + + JournalEntry(long offset) { + final long sizeBlock = map.get(offset); + final long docId = map.get(offset + 1); + + this.offset = offset; + this.header = new SearchIndexJournalEntryHeader( + (int)(sizeBlock >>> 32L), + docId, + IndexBlock.byId((int)(sizeBlock & 0xFFFF_FFFFL))); + } + + public boolean hasNext() { + return nextId() < committedSize; + } + public long docId() { + return header.documentId(); + } + public int domainId() { + return (int) (docId() >>> 32L); + } + public int urlId() { + return (int)(docId() & 0xFFFF_FFFFL); + } + public IndexBlock block() { + return header.block(); + } + public int wordCount() { return header.entrySize(); } + + public SearchIndexJournalEntry readEntry() { + long[] dest = new long[header.entrySize()]; + map.read(dest, offset + HEADER_SIZE_LONGS); + return new SearchIndexJournalEntry(header.entrySize(), dest); + } + + public SearchIndexJournalEntry readEntryUsingBuffer(long[] dest) { + if (dest.length >= header.entrySize()) { + map.read(dest, header.entrySize(), offset + HEADER_SIZE_LONGS); + return new SearchIndexJournalEntry(header.entrySize(), dest); + } + else { + return readEntry(); + } + } + + public long nextId() { + return offset + HEADER_SIZE_LONGS + header.entrySize(); + } + public JournalEntry next() { return new JournalEntry(nextId()); } + + public void copyToBuffer(ByteBuffer buffer) { + var dest = buffer.asLongBuffer(); + dest.position(buffer.position() * 8); + dest.limit(buffer.position()*8 + header.entrySize() + HEADER_SIZE_LONGS); + map.read(dest, offset); + buffer.position(dest.limit()*8); + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java new file mode 100644 index 00000000..4567a428 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java @@ -0,0 +1,10 @@ +package nu.marginalia.wmsa.edge.index.journal; + +public interface SearchIndexJournalWriter { + void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entry); + + void forceWrite(); + + void flushWords(); + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java similarity index 68% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java index cf76ada2..f5ba8b31 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java @@ -3,11 +3,7 @@ package nu.marginalia.wmsa.edge.index.journal; import io.reactivex.rxjava3.disposables.Disposable; import io.reactivex.rxjava3.schedulers.Schedulers; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeId; -import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,10 +13,9 @@ import java.io.IOException; import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; -import java.util.List; import java.util.concurrent.TimeUnit; -public class SearchIndexWriterImpl implements SearchIndexWriter { +public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { private final DictionaryWriter dictionaryWriter; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -28,12 +23,12 @@ public class SearchIndexWriterImpl implements SearchIndexWriter { private RandomAccessFile raf; private FileChannel channel; - public static final int MAX_BLOCK_SIZE = 1000*32*8*4; + public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*32*8*4; private final ByteBuffer byteBuffer; private long pos; @SneakyThrows - public SearchIndexWriterImpl(DictionaryWriter dictionaryWriter, File indexFile) { + public SearchIndexJournalWriterImpl(DictionaryWriter dictionaryWriter, File indexFile) { this.dictionaryWriter = dictionaryWriter; initializeIndexFile(indexFile); @@ -61,23 +56,16 @@ public class SearchIndexWriterImpl implements SearchIndexWriter { @Override @SneakyThrows - public synchronized void put(EdgeId domainId, EdgeId urlId, IndexBlock block, List wordsSuspect) { - int numGoodWords = 0; - for (String word : wordsSuspect) { - if (word.length() < Byte.MAX_VALUE) numGoodWords++; - } + public synchronized void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) { byteBuffer.clear(); - long url_id = ((long) domainId.getId() << 32) | urlId.getId(); - byteBuffer.putLong(url_id); - byteBuffer.putInt(block.id); - byteBuffer.putInt(numGoodWords); - for (String word : wordsSuspect) { - if (word.length() < Byte.MAX_VALUE) { - byteBuffer.putInt(dictionaryWriter.get(word)); - } - } + byteBuffer.putInt(entryData.size()); + byteBuffer.putInt(header.block().id); + byteBuffer.putLong(header.documentId()); + + entryData.write(byteBuffer); + byteBuffer.limit(byteBuffer.position()); byteBuffer.rewind(); @@ -104,11 +92,11 @@ public class SearchIndexWriterImpl implements SearchIndexWriter { } private void writePositionMarker() throws IOException { - var lock = channel.lock(0, 12, false); + var lock = channel.lock(0, 16, false); pos = channel.size(); raf.seek(0); raf.writeLong(pos); - raf.writeInt(dictionaryWriter.size()); + raf.writeLong(dictionaryWriter.size()); raf.seek(pos); lock.release(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java deleted file mode 100644 index 11fc186a..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java +++ /dev/null @@ -1,16 +0,0 @@ -package nu.marginalia.wmsa.edge.index.journal; - -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeId; -import nu.marginalia.wmsa.edge.model.EdgeUrl; - -import java.util.List; - -public interface SearchIndexWriter { - void put(EdgeId domainId, EdgeId urlId, IndexBlock block, List words); - void forceWrite(); - - void flushWords(); - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java index 863c0c65..01ad1e20 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java @@ -7,7 +7,7 @@ import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.EdgeIndexBucket; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -27,8 +27,8 @@ public class SearchIndexes { private final ReentrantLock opsLock = new ReentrantLock(false); - private final SearchIndexWriterImpl primaryIndexWriter; - private final SearchIndexWriterImpl secondaryIndexWriter; + private final SearchIndexJournalWriterImpl primaryIndexWriter; + private final SearchIndexJournalWriterImpl secondaryIndexWriter; private DictionaryReader dictionaryReader = null; @Inject @@ -134,7 +134,7 @@ public class SearchIndexes { } } - public SearchIndexWriterImpl getIndexWriter(int idx) { + public SearchIndexJournalWriterImpl getIndexWriter(int idx) { if (idx == 0) { return primaryIndexWriter; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeId.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeId.java index f2be15fa..0ee908ef 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeId.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeId.java @@ -1,15 +1,10 @@ package nu.marginalia.wmsa.edge.model; -import lombok.AllArgsConstructor; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.ToString; -/** This exists entirely for strengthening the typing of IDs +/** + * This exists entirely for strengthening the typing of IDs * * @param */ -@AllArgsConstructor @Getter @EqualsAndHashCode @ToString -public class EdgeId { - private final int id; +public record EdgeId(int id) { } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java index c6f4fbc5..66438279 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java @@ -32,7 +32,7 @@ public class EdgeSearchResultItem { } public long getCombinedId() { - return ((long) domain.getId() << 32L) | url.getId(); + return ((long) domain.id() << 32L) | url.id(); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java index 66004dee..add46ef4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java @@ -121,7 +121,7 @@ public class EdgeSearchOperator { int domainId = -1; try { if (domain != null) { - return edgeDataStoreDao.getDomainId(new EdgeDomain(domain)).getId(); + return edgeDataStoreDao.getDomainId(new EdgeDomain(domain)).id(); } } catch (NoSuchElementException ex) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java index 6e341721..193f1a1c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java @@ -3,7 +3,6 @@ package nu.marginalia.wmsa.edge.search.command.commands; import com.google.inject.Inject; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import nu.marginalia.wmsa.edge.search.EdgeSearchOperator; @@ -12,7 +11,6 @@ import nu.marginalia.wmsa.edge.search.command.ResponseType; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet; -import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults; import nu.marginalia.wmsa.edge.search.model.DomainInformation; import nu.marginalia.wmsa.edge.search.siteinfo.DomainInformationService; import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; @@ -69,7 +67,7 @@ public class SiteSearchCommand implements SearchCommandInterface { if (null != domain) { resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words, 100, 100, "site:"+domain); - screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).getId()); + screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id()); } else { resultSet = new DecoratedSearchResultSet(Collections.emptyList()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java index 22b24aca..12d358bf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java @@ -78,8 +78,8 @@ public class SearchResultDecorator { TIntArrayList missedIds = new TIntArrayList(); for (var resultItem : resultItems) { - var did = resultItem.getDomain().getId(); - var uid = resultItem.getUrl().getId(); + var did = resultItem.getDomain().id(); + var uid = resultItem.getUrl().id(); var details = detailsById.get(uid); if (details == null) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java index 2f79a9ea..d3eb8061 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java @@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.search.siteinfo; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; @@ -98,7 +97,7 @@ public class DomainInformationService { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); + stmt.setInt(1, domainId.id()); var rsp = stmt.executeQuery(); if (rsp.next()) { return rsp.getInt(1); @@ -115,7 +114,7 @@ public class DomainInformationService { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); + stmt.setInt(1, domainId.id()); var rsp = stmt.executeQuery(); if (rsp.next()) { return rsp.getInt(1); @@ -133,7 +132,7 @@ public class DomainInformationService { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); + stmt.setInt(1, domainId.id()); var rsp = stmt.executeQuery(); if (rsp.next()) { return rsp.getInt(1); @@ -150,7 +149,7 @@ public class DomainInformationService { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) { - stmt.setInt(1, domainId.getId()); + stmt.setInt(1, domainId.id()); var rsp = stmt.executeQuery(); if (rsp.next()) { return rsp.getInt(1); @@ -166,7 +165,7 @@ public class DomainInformationService { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) { - stmt.setInt(1, domainId.getId()); + stmt.setInt(1, domainId.id()); var rsp = stmt.executeQuery(); if (rsp.next()) { return rsp.getInt(1); @@ -183,7 +182,7 @@ public class DomainInformationService { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); + stmt.setInt(1, domainId.id()); var rsp = stmt.executeQuery(); if (rsp.next()) { return rsp.getDouble(1); @@ -199,7 +198,7 @@ public class DomainInformationService { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); + stmt.setInt(1, domainId.id()); var rsp = stmt.executeQuery(); if (rsp.next()) { return EdgeDomainIndexingState.valueOf(rsp.getString(1)); @@ -216,8 +215,8 @@ public class DomainInformationService { public List getLinkingDomains(EdgeId domainId) { try (var connection = dataSource.getConnection()) { List results = new ArrayList<>(25); - try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) { - stmt.setInt(1, domainId.getId()); + try (var stmt = connection.prepareStatement("SELECT SOURCE_DOMAIN FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) { + stmt.setInt(1, domainId.id()); var rsp = stmt.executeQuery(); while (rsp.next()) { results.add(new EdgeDomain(rsp.getString(1))); @@ -237,7 +236,7 @@ public class DomainInformationService { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); + stmt.setInt(1, domainId.id()); var rsp = stmt.executeQuery(); if (rsp.next()) { return rsp.getDouble(1); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java index 2b2da0fd..55015d13 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java @@ -81,7 +81,8 @@ public class EdgeIndexClientTest { service = new EdgeIndexService("127.0.0.1", testPort, init, null, - indexes); + indexes, + servicesFactory); Spark.awaitInitialization(); init.setReady(); @@ -113,7 +114,7 @@ public class EdgeIndexClientTest { indexes.reindexAll(); var rsp = client.query(Context.internal(), EdgeSearchSpecification.justIncludes("trapphus")); System.out.println(rsp); - assertEquals(5, rsp.resultsList.get(IndexBlock.Title).get(0).results.get(0).get(0).url.getId()); + assertEquals(5, rsp.resultsList.get(IndexBlock.Title).get(0).results.get(0).get(0).url.id()); } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java new file mode 100644 index 00000000..39a62033 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java @@ -0,0 +1,76 @@ +package nu.marginalia.wmsa.edge.index.service; + +import lombok.SneakyThrows; +import nu.marginalia.util.multimap.MultimapFileLong; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntryHeader; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; +import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; + +class SearchIndexJournalWriterTest { + DictionaryWriter dictionaryWriter; + SearchIndexJournalWriterImpl writer; + + Path indexFile; + Path wordsFile1; + Path urlsFile1; + Path dictionaryFile; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @BeforeEach @SneakyThrows + void setUp() { + dictionaryFile = Files.createTempFile("tmp", ".dict"); + dictionaryFile.toFile().deleteOnExit(); + + dictionaryWriter = new DictionaryWriter(dictionaryFile.toFile(), 1L<<16, false); + + indexFile = Files.createTempFile("tmp", ".idx"); + indexFile.toFile().deleteOnExit(); + writer = new SearchIndexJournalWriterImpl(dictionaryWriter, indexFile.toFile()); + + wordsFile1 = Files.createTempFile("words1", ".idx"); + urlsFile1 = Files.createTempFile("urls1", ".idx"); + } + + @SneakyThrows + @AfterEach + void tearDown() { + dictionaryWriter.close(); + writer.close(); + indexFile.toFile().delete(); + dictionaryFile.toFile().delete(); + urlsFile1.toFile().delete(); + wordsFile1.toFile().delete(); + } + + @Test + void put() throws IOException { + writer.put(new SearchIndexJournalEntryHeader(4, (1234L << 32) | 5678, IndexBlock.Link), + new SearchIndexJournalEntry(new long[] { 1, 2, 3, 4 })); + writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words), + new SearchIndexJournalEntry(new long[] { 5, 6, 7 })); + writer.forceWrite(); + + var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(indexFile)); + reader.forEach(entry -> { + logger.info("{}, {} {}", entry, entry.urlId(), entry.domainId()); + logger.info("{}", entry.readEntry().toArray()); + }); + } + +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java deleted file mode 100644 index edcfa71f..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java +++ /dev/null @@ -1,90 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service; - -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.reader.SearchIndex; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; -import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.model.EdgeId; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.EnumMap; - -import static nu.marginalia.util.dict.DictionaryHashMap.NO_VALUE; -import static org.junit.jupiter.api.Assertions.*; - -class SearchIndexWriterTest { - DictionaryWriter dictionaryWriter; - SearchIndexWriterImpl writer; - - Path indexFile; - Path wordsFile1; - Path urlsFile1; - Path dictionaryFile; - - @BeforeEach @SneakyThrows - void setUp() { - dictionaryFile = Files.createTempFile("tmp", ".dict"); - dictionaryFile.toFile().deleteOnExit(); - - dictionaryWriter = new DictionaryWriter(dictionaryFile.toFile(), 1L<<16, false); - - indexFile = Files.createTempFile("tmp", ".idx"); - indexFile.toFile().deleteOnExit(); - writer = new SearchIndexWriterImpl(dictionaryWriter, indexFile.toFile()); - - wordsFile1 = Files.createTempFile("words1", ".idx"); - urlsFile1 = Files.createTempFile("urls1", ".idx"); - } - - @SneakyThrows - @AfterEach - void tearDown() { - dictionaryWriter.close(); - writer.close(); - indexFile.toFile().delete(); - dictionaryFile.toFile().delete(); - urlsFile1.toFile().delete(); - wordsFile1.toFile().delete(); - } - - public long[] findWord(SearchIndexReader reader, String word, IndexBlock block) { - IndexSearchBudget budget = new IndexSearchBudget(100); - return reader.findWord(block, budget, lv->true, dictionaryWriter.getReadOnly(word)).stream().toArray(); - } - - @Test @SneakyThrows - void put() throws IOException { - writer.put(new EdgeId<>(0), new EdgeId<>(1), IndexBlock.Words, Arrays.asList("Hello", "Salvete", "everyone!", "This", "is", "Bob")); - writer.put(new EdgeId<>(0), new EdgeId<>(2), IndexBlock.Words, Arrays.asList("Salvete", "omnes!", "Bob", "sum", "Hello")); - writer.forceWrite(); - - new SearchIndexConverter(IndexBlock.Words, 0, Path.of("/tmp"), indexFile.toFile(), wordsFile1.toFile(), urlsFile1.toFile(), new SearchIndexPartitioner(null), val -> false); - - EnumMap indices = new EnumMap(IndexBlock.class); - indices.put(IndexBlock.Words, new SearchIndex("0", urlsFile1.toFile(), wordsFile1.toFile())); - - var reader = new SearchIndexReader(indices); - - int bobId = dictionaryWriter.getReadOnly("Bob"); - assertNotEquals(NO_VALUE, bobId); - - assertEquals(2, reader.numHits(IndexBlock.Words, bobId)); - assertArrayEquals(new long[] { 1, 2 }, findWord(reader,"Bob", IndexBlock.Words)); - assertArrayEquals(new long[] { 2 }, findWord(reader,"sum", IndexBlock.Words)); - assertArrayEquals(new long[] { }, findWord(reader,"New Word", IndexBlock.Words)); - - writer.close(); - } - -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java index 1780b6bb..8e58b117 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java @@ -67,4 +67,37 @@ class RandomWriteFunnelTest { } } } + + + @Test + public void testYuge() { + new File("/tmp/test.bin").delete(); + for (int j = 1; j <= 20; j++) { + try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), 10, j); + var out = new RandomAccessFile("/tmp/test.bin", "rw")) { + for (int i = 10 - 1; i >= 0; i -= 2) { + funnel.put(i, Long.MAX_VALUE - i); + } + funnel.write(out.getChannel()); + + } catch (Exception e) { + e.printStackTrace(); + } + + try (var in = new RandomAccessFile("/tmp/test.bin", "r")) { + in.readLong(); + in.readLong(); + in.readLong(); + in.readLong(); + in.readLong(); + in.readLong(); + in.readLong(); + in.readLong(); + in.readLong(); + in.readLong(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } } \ No newline at end of file diff --git a/third_party/src/main/java/com/upserve/uppend/blobs/NativeIO.java b/third_party/src/main/java/com/upserve/uppend/blobs/NativeIO.java index 80e05c64..0698c5c3 100644 --- a/third_party/src/main/java/com/upserve/uppend/blobs/NativeIO.java +++ b/third_party/src/main/java/com/upserve/uppend/blobs/NativeIO.java @@ -3,18 +3,15 @@ package com.upserve.uppend.blobs; import jnr.ffi.*; import jnr.ffi.types.size_t; -import org.slf4j.Logger; import com.kenai.jffi.MemoryIO; import java.io.IOException; -import java.lang.invoke.MethodHandles; import java.nio.*; // https://github.com/upserve/uppend/blob/70967c6f24d7f1a3bbc18799f485d981da93f53b/src/main/java/com/upserve/uppend/blobs/NativeIO.java // MIT License public class NativeIO { - private static final Logger log = org.slf4j.LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private static final NativeC nativeC = LibraryLoader.create(NativeC.class).load("c"); public static final int pageSize = nativeC.getpagesize(); // 4096 on most Linux From 420b9bb7e0483f6047c8eaf2360bf44cc223965e Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 20 Jun 2022 12:02:01 +0200 Subject: [PATCH 04/40] Refactoring BTreeReader and binary search code --- .../nu/marginalia/util/btree/BTreeReader.java | 110 ++++------ .../nu/marginalia/util/btree/BTreeWriter.java | 88 ++++---- .../util/btree/model/BTreeContext.java | 11 +- .../util/btree/model/BTreeHeader.java | 13 +- .../util/multimap/MultimapFileLong.java | 32 ++- .../multimap/MultimapFileLongOffsetSlice.java | 5 + .../util/multimap/MultimapFileLongSlice.java | 2 + .../util/multimap/MultimapSearcher.java | 192 +++++++----------- .../util/multimap/MultimapSearcherBase.java | 143 +++++++++++++ .../edge/index/reader/IndexWordsTable.java | 6 +- .../wmsa/edge/index/reader/SearchIndex.java | 2 +- .../util/btree/BTreeWriterTest.java | 32 +-- .../util/hash/LongPairHashMapTest.java | 4 +- .../edge/index/service/MultimapFileTest.java | 14 +- 14 files changed, 380 insertions(+), 274 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java index ec8f204b..de675776 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java @@ -11,94 +11,68 @@ public class BTreeReader { private final MultimapFileLong file; private final BTreeContext ctx; + private final Logger logger = LoggerFactory.getLogger(BTreeReader.class); - private final long mask; - private final MultimapSearcher searcher; + + private final MultimapSearcher indexSearcher; + private final MultimapSearcher dataSearcher; public BTreeReader(MultimapFileLong file, BTreeContext ctx) { this.file = file; - this.searcher = file.createSearcher(); + this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1); + this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize()); + this.ctx = ctx; - this.mask = ctx.equalityMask(); } - public long fileSize() { - return file.size(); + public BTreeHeader getHeader(long fileOffset) { + return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2)); } - public BTreeHeader getHeader(long offset) { - return new BTreeHeader(file.get(offset), file.get(offset+1), file.get(offset+2)); - } + /** + * + * @return file offset of entry matching keyRaw, negative if absent + */ + public long findEntry(BTreeHeader header, final long keyRaw) { + final long key = keyRaw & ctx.equalityMask(); - public long offsetForEntry(BTreeHeader header, final long keyRaw) { - final long key = keyRaw & mask; + final long dataAddress = header.dataOffsetLongs(); + final int entrySize = ctx.entrySize(); + final int blockSize = ctx.BLOCK_SIZE_WORDS(); - if (header.layers() == 0) { - return trivialSearch(header, key); + if (header.layers() == 0) { // For small data, we only have a data block + return dataSearcher.binarySearchUpperBound(key, dataAddress, header.numEntries()); } - long p = searchEntireTopLayer(header, key); - if (p < 0) return -1; + final long indexOffset = header.indexOffsetLongs(); - long cumOffset = p * ctx.BLOCK_SIZE_WORDS(); + // Search the top layer + long layerOffset = indexSearch(key, indexOffset, blockSize); + if (layerOffset < 0) return -1; + + // Search intermediary layers for (int i = header.layers() - 2; i >= 0; --i) { - long offsetBase = header.indexOffsetLongs() + header.relativeLayerOffset(ctx, i); - p = searchLayerBlock(key, offsetBase+cumOffset); - if (p < 0) + final long layerAddressBase = indexOffset + header.relativeIndexLayerOffset(ctx, i); + final long layerBlockOffset = layerAddressBase + layerOffset; + + final long nextLayerOffset = indexSearch(key, layerBlockOffset, blockSize); + if (nextLayerOffset < 0) return -1; - cumOffset = ctx.BLOCK_SIZE_WORDS()*(p + cumOffset); + + layerOffset = blockSize*(nextLayerOffset + layerOffset); } - long dataMax = header.dataOffsetLongs() + (long) header.numEntries() * ctx.entrySize(); - return searchDataBlock(key, - header.dataOffsetLongs() + ctx.entrySize()*cumOffset, - dataMax); + // Search the corresponding data block + final long searchStart = dataAddress + layerOffset * entrySize; + final long lastDataAddress = dataAddress + (long) header.numEntries() * entrySize; + final long lastItemInBlockAddress = searchStart + (long) blockSize * entrySize; + final long searchEnd = Math.min(lastItemInBlockAddress, lastDataAddress); + + return dataSearcher.binarySearchUpperBound(key, searchStart, (searchEnd - searchStart) / entrySize); } - - private long searchEntireTopLayer(BTreeHeader header, long key) { - long offset = header.indexOffsetLongs(); - - return searcher.binarySearchUpperBound(key, offset, offset + ctx.BLOCK_SIZE_WORDS()) - offset; - } - - private long searchLayerBlock(long key, long blockOffset) { - if (blockOffset < 0) - return blockOffset; - - return searcher.binarySearchUpperBound(key, blockOffset, blockOffset + ctx.BLOCK_SIZE_WORDS()) - blockOffset; - } - - - private long searchDataBlock(long key, long blockOffset, long dataMax) { - if (blockOffset < 0) - return blockOffset; - - long lastOffset = Math.min(blockOffset+ctx.BLOCK_SIZE_WORDS()*(long)ctx.entrySize(), dataMax); - int length = (int)(lastOffset - blockOffset); - - if (ctx.entrySize() == 1) { - if (mask == ~0L) return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, blockOffset+length); - return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, blockOffset+length, mask); - } - - return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, ctx.entrySize(), length/ctx.entrySize(), mask); - } - - private long trivialSearch(BTreeHeader header, long key) { - long offset = header.dataOffsetLongs(); - - if (ctx.entrySize() == 1) { - if (mask == ~0L) { - return searcher.binarySearchUpperBoundNoMiss(key, offset, offset+header.numEntries()); - } - else { - return searcher.binarySearchUpperBoundNoMiss(key, offset, offset+header.numEntries(), mask); - } - } - - return searcher.binarySearchUpperBoundNoMiss(key, offset, ctx.entrySize(), header.numEntries(), mask); - + private long indexSearch(long key, long start, long n) { + return indexSearcher.binarySearch(key, start, n) - start; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java index b43faca7..0c1f0789 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java @@ -2,16 +2,12 @@ package nu.marginalia.util.btree; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.btree.model.BTreeHeader; -import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.multimap.MultimapFileLongSlice; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.IOException; public class BTreeWriter { - private final Logger logger = LoggerFactory.getLogger(BTreeWriter.class); private final BTreeContext ctx; private final MultimapFileLongSlice map; @@ -27,7 +23,7 @@ public class BTreeWriter { long size = 0; for (int layer = 0; layer < numLayers; layer++) { - size += ctx.layerSize(numWords, layer); + size += ctx.indexLayerSize(numWords, layer); } return size; } @@ -45,17 +41,17 @@ public class BTreeWriter { writeIndexCallback.write(map.atOffset(header.dataOffsetLongs())); - if (header.layers() < 1) { + if (header.layers() < 1) { // The data is too small to benefit from indexing + return ctx.calculateSize(numEntries); + } + else { + writeIndex(header); return ctx.calculateSize(numEntries); } - - writeIndex(header); - - return ctx.calculateSize(numEntries); } public static BTreeHeader makeHeader(BTreeContext ctx, long offset, int numEntries) { - final int numLayers = ctx.numLayers(numEntries); + final int numLayers = ctx.numIndexLayers(numEntries); final int padding = BTreeHeader.getPadding(ctx, offset, numLayers); @@ -71,46 +67,50 @@ public class BTreeWriter { private void writeIndex(BTreeHeader header) { - var layerOffsets = getRelativeLayerOffsets(header); + var layerOffsets = header.getRelativeLayerOffsets(ctx); - long stride = ctx.BLOCK_SIZE_WORDS(); + long indexedDataStepSize = ctx.BLOCK_SIZE_WORDS(); + + /* Index layer 0 indexes the data itself + Index layer 1 indexes layer 0 + Index layer 2 indexes layer 1 + And so on + */ for (int layer = 0; layer < header.layers(); layer++, - stride*=ctx.BLOCK_SIZE_WORDS()) { - long indexWord = 0; - long offsetBase = layerOffsets[layer] + header.indexOffsetLongs(); - long numEntries = header.numEntries(); - for (long idx = 0; idx < numEntries; idx += stride, indexWord++) { - long dataOffset = header.dataOffsetLongs() + (idx + (stride-1)) * ctx.entrySize(); - long val; + indexedDataStepSize*=ctx.BLOCK_SIZE_WORDS()) { - if (idx + (stride-1) < numEntries) { - val = map.get(dataOffset) & ctx.equalityMask(); - } - else { - val = Long.MAX_VALUE; - } - if (offsetBase + indexWord < 0) { - logger.error("bad put @ {}", offsetBase + indexWord); - logger.error("layer{}", layer); - logger.error("layer offsets {}", layerOffsets); - logger.error("offsetBase = {}", offsetBase); - logger.error("numEntries = {}", numEntries); - logger.error("indexWord = {}", indexWord); - } - map.put(offsetBase + indexWord, val); - } - for (; (indexWord % ctx.BLOCK_SIZE_WORDS()) != 0; indexWord++) { - map.put(offsetBase + indexWord, Long.MAX_VALUE); - } + writeIndexLayer(header, layerOffsets, indexedDataStepSize, layer); } } - private long[] getRelativeLayerOffsets(BTreeHeader header) { - long[] layerOffsets = new long[header.layers()]; - for (int i = 0; i < header.layers(); i++) { - layerOffsets[i] = header.relativeLayerOffset(ctx, i); + private void writeIndexLayer(BTreeHeader header, long[] layerOffsets, + final long indexedDataStepSize, + final int layer) { + + final long indexOffsetBase = layerOffsets[layer] + header.indexOffsetLongs(); + final long dataOffsetBase = header.dataOffsetLongs(); + + final long dataEntriesMax = header.numEntries(); + final int entrySize = ctx.entrySize(); + + final long lastDataEntryOffset = indexedDataStepSize - 1; + + long indexWord = 0; + + for (long dataPtr = 0; + dataPtr + lastDataEntryOffset < dataEntriesMax; + dataPtr += indexedDataStepSize) + { + long dataOffset = dataOffsetBase + (dataPtr + lastDataEntryOffset) * entrySize; + map.put(indexOffsetBase + indexWord++, map.get(dataOffset) & ctx.equalityMask()); } - return layerOffsets; + + // Fill the remaining block with LONG_MAX + map.setRange(indexOffsetBase+indexWord, + (int) (ctx.BLOCK_SIZE_WORDS() - (indexWord % ctx.BLOCK_SIZE_WORDS())), + Long.MAX_VALUE); } + + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java index 4655946c..a7d6b22b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java @@ -10,7 +10,6 @@ public record BTreeContext(int MAX_LAYERS, public BTreeContext(int MAX_LAYERS, int entrySize, long equalityMask, int BLOCK_SIZE_BITS) { this(MAX_LAYERS, entrySize, equalityMask, BLOCK_SIZE_BITS, 1 << BLOCK_SIZE_BITS); - } public long calculateSize(int numEntries) { @@ -19,7 +18,7 @@ public record BTreeContext(int MAX_LAYERS, return header.dataOffsetLongs() + (long)numEntries * entrySize; } - public int numLayers(int numEntries) { + public int numIndexLayers(int numEntries) { if (numEntries <= BLOCK_SIZE_WORDS*2) { return 0; } @@ -36,11 +35,7 @@ public record BTreeContext(int MAX_LAYERS, return MAX_LAYERS; } - public long layerSize(int numEntries, int level) { - return BLOCK_SIZE_WORDS * numBlocks(numEntries, level); - } - - private long numBlocks(int numWords, int level) { + public long indexLayerSize(int numWords, int level) { long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1)); int numBlocks = 0; @@ -50,7 +45,7 @@ public record BTreeContext(int MAX_LAYERS, numBlocks++; } - return numBlocks; + return (long) BLOCK_SIZE_WORDS * numBlocks; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java index 8d68b424..8cdcd355 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java @@ -1,6 +1,5 @@ package nu.marginalia.util.btree.model; -import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.multimap.MultimapFileLongSlice; public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) { @@ -36,12 +35,20 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, lon } - public long relativeLayerOffset(BTreeContext ctx, int n) { + public long relativeIndexLayerOffset(BTreeContext ctx, int n) { long offset = 0; for (int i = n+1; i < layers; i++) { - offset += ctx.layerSize( numEntries, i); + offset += ctx.indexLayerSize( numEntries, i); } return offset; } + public long[] getRelativeLayerOffsets(BTreeContext ctx) { + long[] layerOffsets = new long[layers()]; + for (int i = 0; i < layers(); i++) { + layerOffsets[i] = relativeIndexLayerOffset(ctx, i); + } + return layerOffsets; + } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java index e9a9b4fe..00ccd82c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java @@ -97,8 +97,8 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { readableSize(mapSizeBytes), readableSize(8L*bufferSizeWords), mode); } - public MultimapSearcher createSearcher() { - return new MultimapSearcher(this); + public MultimapSearcherBase createSearcher() { + return new MultimapSearcherBase(this); } public MultimapSorter createSorter(Path tmpFile, int internalSortLimit) { return new MultimapSorter(this, tmpFile, internalSortLimit); @@ -332,6 +332,34 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { } + @Override + public void setRange(long idx, int n, long val) { + if (n == 0) return; + + if (idx+n >= mappedSize) { + grow(idx+n); + } + int iN = (int)((idx + n) / bufferSize); + + for (int i = 0; i < n; ) { + int i0 = (int)((idx + i) / bufferSize); + + int bufferOffset = (int) ((idx+i) % bufferSize); + var buffer = buffers.get(i0); + + final int l; + + if (i0 < iN) l = bufferSize - bufferOffset; + else l = Math.min(n - i, bufferSize - bufferOffset); + + for (int p = 0; p < l; p++) { + buffer.put(bufferOffset + p, val); + } + + i+=l; + } + } + @Override public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java index bd35bd9b..f379d1c6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java @@ -23,6 +23,11 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice { map.put(off+idx, val); } + @Override + public void setRange(long idx, int n, long val) { + map.setRange(off+idx, n, val); + } + @Override public long get(long idx) { return map.get(off+idx); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java index 27d6ae06..29f9994d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java @@ -9,6 +9,8 @@ public interface MultimapFileLongSlice { void put(long idx, long val); + void setRange(long idx, int n, long val); + long get(long idx); void read(long[] vals, long idx); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java index 005888d8..886912c5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java @@ -1,128 +1,80 @@ package nu.marginalia.util.multimap; -import lombok.experimental.Delegate; +public interface MultimapSearcher { + long binarySearch(long key, long fromIndex, long n); + long binarySearchUpperBound(long key, long fromIndex, long n); -public class MultimapSearcher { - @Delegate - private final MultimapFileLongSlice mmf; - - public MultimapSearcher(MultimapFileLongSlice mmf) { - this.mmf = mmf; - } - - public boolean binarySearch(long key, long fromIndex, long toIndex) { - - long low = fromIndex; - long high = toIndex - 1; - - while (low <= high) { - long mid = (low + high) >>> 1; - long midVal = get(mid); - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return true; // key found + static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) { + if (mask == ~0L && stepSize == 1) { + return new SimpleMultimapSearcher(new MultimapSearcherBase(slice)); } - return false; // key not found. - } - - public long binarySearchUpperBound(long key, long fromIndex, long toIndex) { - - long low = fromIndex; - long high = toIndex - 1; - - while (low <= high) { - long mid = (low + high) >>> 1; - long midVal = get(mid); - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return mid; + else if (stepSize == 1) { + return new MaskedMultimapSearcher(new MultimapSearcherBase(slice), mask); } - return low; - } - - public long binarySearchUpperBound(long key, long fromIndex, long toIndex, long mask) { - - long low = fromIndex; - long high = toIndex - 1; - - while (low <= high) { - long mid = (low + high) >>> 1; - long midVal = get(mid) & mask; - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return mid; + else { + return new SteppingMaskedMultimapSearcher(new MultimapSearcherBase(slice), mask, stepSize); } - return low; - } - - public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long toIndex) { - - long low = fromIndex; - long high = toIndex - 1; - - while (low <= high) { - long mid = (low + high) >>> 1; - long midVal = get(mid); - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return mid; - } - return -1; - } - - - public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long toIndex, long mask) { - - long low = fromIndex; - long high = toIndex - 1; - - while (low <= high) { - long mid = (low + high) >>> 1; - long midVal = get(mid) & mask; - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return mid; - } - return -1; - } - - - public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long step, long steps, long mask) { - - long low = 0; - long high = steps - 1; - - while (low <= high) { - long mid = (low + high) >>> 1; - long midVal = get(fromIndex + mid*step) & mask; - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid*step; - } - return -1; } } + +class SimpleMultimapSearcher implements MultimapSearcher { + private final MultimapSearcherBase base; + + SimpleMultimapSearcher(MultimapSearcherBase base) { + this.base = base; + } + + @Override + public long binarySearch(long key, long fromIndex, long n) { + return base.binarySearchOffset(key, fromIndex, n); + } + + @Override + public long binarySearchUpperBound(long key, long fromIndex, long n) { + return base.binarySearchUpperBound(key, fromIndex, n); + } +} + + +class MaskedMultimapSearcher implements MultimapSearcher { + private final MultimapSearcherBase base; + private final long mask; + + MaskedMultimapSearcher(MultimapSearcherBase base, long mask) { + this.base = base; + this.mask = mask; + } + + @Override + public long binarySearch(long key, long fromIndex, long n) { + return base.binarySearchOffset(key, fromIndex, n, mask); + } + + @Override + public long binarySearchUpperBound(long key, long fromIndex, long n) { + return base.binarySearchUpperBound(key, fromIndex, n, mask); + } +} + + +class SteppingMaskedMultimapSearcher implements MultimapSearcher { + private final MultimapSearcherBase base; + private final long mask; + private final int step; + + SteppingMaskedMultimapSearcher(MultimapSearcherBase base, long mask, int step) { + this.base = base; + this.mask = mask; + this.step = step; + } + + @Override + public long binarySearch(long key, long fromIndex, long n) { + return base.binarySearchOffset(key, fromIndex, step, n, mask); + } + + @Override + public long binarySearchUpperBound(long key, long fromIndex, long n) { + return base.binarySearchUpperBound(key, fromIndex, step, n, mask); + } +} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java new file mode 100644 index 00000000..2bd8c166 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java @@ -0,0 +1,143 @@ +package nu.marginalia.util.multimap; + +import lombok.experimental.Delegate; + +public class MultimapSearcherBase { + @Delegate + private final MultimapFileLongSlice mmf; + + public MultimapSearcherBase(MultimapFileLongSlice mmf) { + this.mmf = mmf; + } + + public boolean binarySearchTest(long key, long fromIndex, long n) { + + long low = 0; + long high = n - 1; + + while (low <= high) { + long mid = (low + high) >>> 1; + long midVal = get(fromIndex + mid); + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return true; + } + return false; + } + + public long binarySearchOffset(long key, long fromIndex, long n) { + long low = 0; + long high = n - 1; + + while (low <= high) { + long mid = (low + high) >>> 1; + long midVal = get(fromIndex + mid); + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return fromIndex + mid; + } + return fromIndex + low; + } + + + public long binarySearchOffset(long key, long fromIndex, long n, long mask) { + long low = 0; + long high = n - 1; + + while (low <= high) { + long mid = (low + high) >>> 1; + long midVal = get(fromIndex + mid) & mask; + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return fromIndex + mid; + } + return fromIndex + low; + } + + + public long binarySearchOffset(long key, long fromIndex, int step, long n, long mask) { + long low = 0; + long high = n - 1; + + while (low <= high) { + long mid = (low + high) >>> 1; + long midVal = get(fromIndex + mid*step) & mask; + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return fromIndex + mid*step; + } + return fromIndex + low; + } + + public long binarySearchUpperBound(long key, long fromIndex, long n) { + long low = 0; + long high = n - 1; + + while (low <= high) { + long mid = (low + high) >>> 1; + long midVal = get(fromIndex + mid); + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return fromIndex + mid; + } + return -1; + } + + + public long binarySearchUpperBound(long key, long fromIndex, long n, long mask) { + long low = 0; + long high = n - 1; + + while (low <= high) { + long mid = (low + high) >>> 1; + long midVal = get(fromIndex + mid) & mask; + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return fromIndex + mid; + } + return -1; + } + + + public long binarySearchUpperBound(long key, long fromIndex, int step, long n, long mask) { + long low = 0; + long high = n - 1; + + while (low <= high) { + long mid = (low + high) >>> 1; + long midVal = get(fromIndex + mid*step) & mask; + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return fromIndex + mid*step; + } + return -1; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java index 2bde1aa7..681e42ea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java @@ -45,12 +45,12 @@ public class IndexWordsTable implements AutoCloseable { private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException { return new MultimapFileLong(wordsFile, - FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false); + FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE); } public long positionForWord(int wordId) { - long offset = reader.offsetForEntry(header, wordId); + long offset = reader.findEntry(header, wordId); if (offset < 0) { return -1L; } @@ -60,7 +60,7 @@ public class IndexWordsTable implements AutoCloseable { public int wordLength(int wordId) { - long offset = reader.offsetForEntry(header, wordId); + long offset = reader.findEntry(header, wordId); if (offset < 0) { return -1; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java index 042f8f54..0ab4d80b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java @@ -82,7 +82,7 @@ public class SearchIndex implements AutoCloseable { if (!range.isPresent()) return false; - return bTreeReader.offsetForEntry(bTreeReader.getHeader(range.dataOffset), url) >= 0; + return bTreeReader.findEntry(bTreeReader.getHeader(range.dataOffset), url) >= 0; } public class UrlIndexTree { diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java index 875cda37..73aa4dc3 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java @@ -48,9 +48,9 @@ class BTreeWriterTest { @Test void testLayerOffset() { int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS(); - System.out.println(writer.makeHeader(1025, wcub).relativeLayerOffset(ctx, 0)); - System.out.println(writer.makeHeader(1025, wcub).relativeLayerOffset(ctx, 1)); - System.out.println(writer.makeHeader(1025, wcub).relativeLayerOffset(ctx, 2)); + System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 0)); + System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 1)); + System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 2)); for (int i = 0; i < 1024; i++) { var header = writer.makeHeader(0, i); @@ -59,7 +59,7 @@ class BTreeWriterTest { printTreeLayout(i, header, ctx); if (header.layers() >= 1) { - assertEquals(1, ctx.layerSize(i, header.layers() - 1) / ctx.BLOCK_SIZE_WORDS()); + assertEquals(1, ctx.indexLayerSize(i, header.layers() - 1) / ctx.BLOCK_SIZE_WORDS()); } } } @@ -67,7 +67,7 @@ class BTreeWriterTest { private void printTreeLayout(int numEntries, BTreeHeader header, BTreeContext ctx) { StringJoiner sj = new StringJoiner(","); for (int l = 0; l < header.layers(); l++) { - sj.add(""+ctx.layerSize(numEntries, l)/ctx.BLOCK_SIZE_WORDS()); + sj.add(""+ctx.indexLayerSize(numEntries, l)/ctx.BLOCK_SIZE_WORDS()); } System.out.println(numEntries + ":" + sj); } @@ -86,7 +86,7 @@ class BTreeWriterTest { try { RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); - MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true); + MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000); { var writer = new BTreeWriter(mmf, ctx); @@ -103,7 +103,7 @@ class BTreeWriterTest { var reader = new BTreeReader(mmf, ctx); var header = reader.getHeader(0); for (int i = 0; i < data.length; i++) { - long offset = reader.offsetForEntry(header, data[i]); + long offset = reader.findEntry(header, data[i]); assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); assertEquals(i, mmf.get(offset+1)); } @@ -129,7 +129,7 @@ class BTreeWriterTest { try { RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); - MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true); + MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000); { var writer = new BTreeWriter(mmf, ctx); @@ -146,7 +146,7 @@ class BTreeWriterTest { var reader = new BTreeReader(mmf, ctx); var header = reader.getHeader(0); for (int i = 0; i < data.length; i++) { - long offset = reader.offsetForEntry(header, data[i]); + long offset = reader.findEntry(header, data[i]); assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); assertEquals(i, mmf.get(offset+1)); } @@ -154,7 +154,7 @@ class BTreeWriterTest { for (int i = 0; i < 500; i++) { long val = (long)(Long.MAX_VALUE * Math.random()); while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.offsetForEntry(header, val)); + assertEquals(-1, reader.findEntry(header, val)); } } } catch (Exception e) { @@ -197,7 +197,7 @@ class BTreeWriterTest { printTreeLayout(toPut.size(), header, ctx); for (int i = 0; i < data.length; i++) { - long offset = reader.offsetForEntry(header, data[i]); + long offset = reader.findEntry(header, data[i]); assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); assertEquals(data[i], mmf.get(offset)); } @@ -205,7 +205,7 @@ class BTreeWriterTest { for (int i = 0; i < 500; i++) { long val = (long) (Long.MAX_VALUE * Math.random()); while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.offsetForEntry(header, val)); + assertEquals(-1, reader.findEntry(header, val)); } } } catch (Exception e) { @@ -250,7 +250,7 @@ class BTreeWriterTest { printTreeLayout(toPut.size(), header, ctx); for (int i = 0; i < data.length; i++) { - long offset = reader.offsetForEntry(header, data[i] & mask); + long offset = reader.findEntry(header, data[i] & mask); assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); assertEquals(data[i], mmf.get(offset)); } @@ -258,7 +258,7 @@ class BTreeWriterTest { for (int i = 0; i < 500; i++) { long val = (long) (Long.MAX_VALUE * Math.random()); while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.offsetForEntry(header, val & mask)); + assertEquals(-1, reader.findEntry(header, val & mask)); } } } catch (Exception e) { @@ -304,7 +304,7 @@ class BTreeWriterTest { printTreeLayout(toPut.size(), header, ctx); for (int i = 0; i < data.length; i++) { - long offset = reader.offsetForEntry(header, data[i] & mask); + long offset = reader.findEntry(header, data[i] & mask); assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); assertEquals(data[i], mmf.get(offset)); assertEquals(i, mmf.get(offset+1)); @@ -313,7 +313,7 @@ class BTreeWriterTest { for (int i = 0; i < 500; i++) { long val = (long) (Long.MAX_VALUE * Math.random()); while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.offsetForEntry(header, val & mask)); + assertEquals(-1, reader.findEntry(header, val & mask)); } } } catch (Exception e) { diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java index 9331a998..d2bec272 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java @@ -26,7 +26,7 @@ class LongPairHashMapTest { try { RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); - MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true); + MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000); var lphm = LongPairHashMap.createNew(mmf, 1024); toPut.forEach(i -> { lphm.put(new LongPairHashMap.CellData(i, i)); @@ -35,7 +35,7 @@ class LongPairHashMapTest { lphm.close(); RandomAccessFile raf2 = new RandomAccessFile(tempFile.toFile(), "rw"); - MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000, true); + MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000); var lphm2 = LongPairHashMap.loadExisting(mmf2); toPut.forEach(i -> { Assertions.assertTrue(lphm2.get(i).isSet()); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java index 44e4207a..bb7b360e 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java @@ -56,7 +56,7 @@ class MultimapFileTest { @SneakyThrows @Test void put() { - var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false); + var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8); for (int i = 0; i < 32; i++) { file.put(i, i); } @@ -68,7 +68,7 @@ class MultimapFileTest { @SneakyThrows @Test void read() { - var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false); + var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8); for (int i = 0; i < 32; i++) { file.put(i, i); } @@ -85,7 +85,7 @@ class MultimapFileTest { @Test void write() throws IOException { - var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false); + var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8); for (int i = 0; i < 32-6; i++) { file.write(new long[] { 0,1,2,3,4,5}, i); @@ -98,7 +98,7 @@ class MultimapFileTest { @Test void sortInternal() throws IOException { - var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false); + var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8); var sorter = file.createSorter(Path.of("/tmp"), 16); var searcher = file.createSearcher(); for (int i = 0; i < 32; i++) { @@ -109,13 +109,13 @@ class MultimapFileTest { for (int i = 2+1; i < 16; i++) { assertTrue(file.get(i) > file.get(i-1)); - assertTrue(searcher.binarySearch(file.get(i), 2, 18)); + assertTrue(searcher.binarySearchTest(file.get(i), 2, 16)); } } @Test void sortExternal() throws IOException { - var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false); + var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8); var sorter = file.createSorter(Path.of("/tmp"), 2); var searcher = file.createSearcher(); @@ -128,7 +128,7 @@ class MultimapFileTest { for (int i = 2+1; i < 16; i++) { assertTrue(file.get(i) > file.get(i-1)); - assertTrue(searcher.binarySearch(file.get(i), 2, 18)); + assertTrue(searcher.binarySearchTest(file.get(i), 2, 16)); } } From c324c80efca4abe9c5cd23db365231333c0293da Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 20 Jun 2022 12:04:06 +0200 Subject: [PATCH 05/40] Refactoring BTreeReader and binary search code --- .../nu/marginalia/util/btree/BTreeReader.java | 6 ++--- .../util/multimap/MultimapSearcher.java | 26 +++++++++---------- .../util/multimap/MultimapSearcherBase.java | 12 ++++----- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java index de675776..e0f3851a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java @@ -41,7 +41,7 @@ public class BTreeReader { final int blockSize = ctx.BLOCK_SIZE_WORDS(); if (header.layers() == 0) { // For small data, we only have a data block - return dataSearcher.binarySearchUpperBound(key, dataAddress, header.numEntries()); + return dataSearcher.binarySearch(key, dataAddress, header.numEntries()); } final long indexOffset = header.indexOffsetLongs(); @@ -68,11 +68,11 @@ public class BTreeReader { final long lastItemInBlockAddress = searchStart + (long) blockSize * entrySize; final long searchEnd = Math.min(lastItemInBlockAddress, lastDataAddress); - return dataSearcher.binarySearchUpperBound(key, searchStart, (searchEnd - searchStart) / entrySize); + return dataSearcher.binarySearch(key, searchStart, (searchEnd - searchStart) / entrySize); } private long indexSearch(long key, long start, long n) { - return indexSearcher.binarySearch(key, start, n) - start; + return indexSearcher.binarySearchUpper(key, start, n) - start; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java index 886912c5..dd339e40 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java @@ -1,8 +1,8 @@ package nu.marginalia.util.multimap; public interface MultimapSearcher { + long binarySearchUpper(long key, long fromIndex, long n); long binarySearch(long key, long fromIndex, long n); - long binarySearchUpperBound(long key, long fromIndex, long n); static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) { if (mask == ~0L && stepSize == 1) { @@ -25,13 +25,13 @@ class SimpleMultimapSearcher implements MultimapSearcher { } @Override - public long binarySearch(long key, long fromIndex, long n) { - return base.binarySearchOffset(key, fromIndex, n); + public long binarySearchUpper(long key, long fromIndex, long n) { + return base.binarySearchUpper(key, fromIndex, n); } @Override - public long binarySearchUpperBound(long key, long fromIndex, long n) { - return base.binarySearchUpperBound(key, fromIndex, n); + public long binarySearch(long key, long fromIndex, long n) { + return base.binarySearch(key, fromIndex, n); } } @@ -46,13 +46,13 @@ class MaskedMultimapSearcher implements MultimapSearcher { } @Override - public long binarySearch(long key, long fromIndex, long n) { - return base.binarySearchOffset(key, fromIndex, n, mask); + public long binarySearchUpper(long key, long fromIndex, long n) { + return base.binarySearchUpper(key, fromIndex, n, mask); } @Override - public long binarySearchUpperBound(long key, long fromIndex, long n) { - return base.binarySearchUpperBound(key, fromIndex, n, mask); + public long binarySearch(long key, long fromIndex, long n) { + return base.binarySearch(key, fromIndex, n, mask); } } @@ -69,12 +69,12 @@ class SteppingMaskedMultimapSearcher implements MultimapSearcher { } @Override - public long binarySearch(long key, long fromIndex, long n) { - return base.binarySearchOffset(key, fromIndex, step, n, mask); + public long binarySearchUpper(long key, long fromIndex, long n) { + return base.binarySearchUpper(key, fromIndex, step, n, mask); } @Override - public long binarySearchUpperBound(long key, long fromIndex, long n) { - return base.binarySearchUpperBound(key, fromIndex, step, n, mask); + public long binarySearch(long key, long fromIndex, long n) { + return base.binarySearch(key, fromIndex, step, n, mask); } } \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java index 2bd8c166..30549a8c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java @@ -29,7 +29,7 @@ public class MultimapSearcherBase { return false; } - public long binarySearchOffset(long key, long fromIndex, long n) { + public long binarySearchUpper(long key, long fromIndex, long n) { long low = 0; long high = n - 1; @@ -48,7 +48,7 @@ public class MultimapSearcherBase { } - public long binarySearchOffset(long key, long fromIndex, long n, long mask) { + public long binarySearchUpper(long key, long fromIndex, long n, long mask) { long low = 0; long high = n - 1; @@ -67,7 +67,7 @@ public class MultimapSearcherBase { } - public long binarySearchOffset(long key, long fromIndex, int step, long n, long mask) { + public long binarySearchUpper(long key, long fromIndex, int step, long n, long mask) { long low = 0; long high = n - 1; @@ -85,7 +85,7 @@ public class MultimapSearcherBase { return fromIndex + low; } - public long binarySearchUpperBound(long key, long fromIndex, long n) { + public long binarySearch(long key, long fromIndex, long n) { long low = 0; long high = n - 1; @@ -104,7 +104,7 @@ public class MultimapSearcherBase { } - public long binarySearchUpperBound(long key, long fromIndex, long n, long mask) { + public long binarySearch(long key, long fromIndex, long n, long mask) { long low = 0; long high = n - 1; @@ -123,7 +123,7 @@ public class MultimapSearcherBase { } - public long binarySearchUpperBound(long key, long fromIndex, int step, long n, long mask) { + public long binarySearch(long key, long fromIndex, int step, long n, long mask) { long low = 0; long high = n - 1; From b1eff0107ce9a75c856e90deae5dd732035bf57a Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 20 Jun 2022 12:25:34 +0200 Subject: [PATCH 06/40] Refactoring BTreeReader and binary search code --- .../nu/marginalia/util/btree/BTreeReader.java | 57 ++++++++++--------- .../util/btree/model/BTreeContext.java | 11 ++-- 2 files changed, 33 insertions(+), 35 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java index e0f3851a..ab1b7a97 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java @@ -4,16 +4,15 @@ import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.multimap.MultimapSearcher; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.jetbrains.annotations.Nullable; + +import static java.lang.Math.min; public class BTreeReader { private final MultimapFileLong file; private final BTreeContext ctx; - private final Logger logger = LoggerFactory.getLogger(BTreeReader.class); - private final MultimapSearcher indexSearcher; private final MultimapSearcher dataSearcher; @@ -35,40 +34,42 @@ public class BTreeReader { */ public long findEntry(BTreeHeader header, final long keyRaw) { final long key = keyRaw & ctx.equalityMask(); - - final long dataAddress = header.dataOffsetLongs(); - final int entrySize = ctx.entrySize(); final int blockSize = ctx.BLOCK_SIZE_WORDS(); + final long dataAddress = header.dataOffsetLongs(); if (header.layers() == 0) { // For small data, we only have a data block return dataSearcher.binarySearch(key, dataAddress, header.numEntries()); } - final long indexOffset = header.indexOffsetLongs(); - - // Search the top layer - long layerOffset = indexSearch(key, indexOffset, blockSize); - if (layerOffset < 0) return -1; - - // Search intermediary layers - for (int i = header.layers() - 2; i >= 0; --i) { - final long layerAddressBase = indexOffset + header.relativeIndexLayerOffset(ctx, i); - final long layerBlockOffset = layerAddressBase + layerOffset; - - final long nextLayerOffset = indexSearch(key, layerBlockOffset, blockSize); - if (nextLayerOffset < 0) - return -1; - - layerOffset = blockSize*(nextLayerOffset + layerOffset); + // Search index layers + long dataLayerOffset = searchIndex(header, key); + if (dataLayerOffset < 0) { + return dataLayerOffset; } // Search the corresponding data block - final long searchStart = dataAddress + layerOffset * entrySize; - final long lastDataAddress = dataAddress + (long) header.numEntries() * entrySize; - final long lastItemInBlockAddress = searchStart + (long) blockSize * entrySize; - final long searchEnd = Math.min(lastItemInBlockAddress, lastDataAddress); + final long searchStart = dataAddress + dataLayerOffset * ctx.entrySize(); + final long numEntries = min(header.numEntries() - dataLayerOffset, blockSize); - return dataSearcher.binarySearch(key, searchStart, (searchEnd - searchStart) / entrySize); + return dataSearcher.binarySearch(key, searchStart, numEntries); + } + + private long searchIndex(BTreeHeader header, long key) { + final int blockSize = ctx.BLOCK_SIZE_WORDS(); + final long indexAddress = header.indexOffsetLongs(); + + long layerOffset = 0; + + for (int i = header.layers() - 1; i >= 0; --i) { + final long layerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset; + + final long nextLayerOffset = indexSearch(key, indexAddress + layerBlockOffset, blockSize); + if (nextLayerOffset < 0) + return -1; + + layerOffset = blockSize *(nextLayerOffset + layerOffset); + } + return layerOffset; } private long indexSearch(long key, long start, long n) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java index a7d6b22b..e91b71fd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java @@ -36,16 +36,13 @@ public record BTreeContext(int MAX_LAYERS, } public long indexLayerSize(int numWords, int level) { + final long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1)); + final long numBlocks = numWords / layerSize; - long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1)); - int numBlocks = 0; - - numBlocks += numWords / layerSize; if (numWords % layerSize != 0) { - numBlocks++; + return BLOCK_SIZE_WORDS * (numBlocks + 1); } - - return (long) BLOCK_SIZE_WORDS * numBlocks; + return BLOCK_SIZE_WORDS * numBlocks; } } From 8139ab0d1d37b88f19662e2c527344b850236b8c Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 20 Jun 2022 12:28:15 +0200 Subject: [PATCH 07/40] Refactoring BTreeReader and binary search code --- .../src/main/java/nu/marginalia/util/btree/BTreeReader.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java index ab1b7a97..42605c04 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java @@ -4,7 +4,6 @@ import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.multimap.MultimapSearcher; -import org.jetbrains.annotations.Nullable; import static java.lang.Math.min; @@ -33,8 +32,9 @@ public class BTreeReader { * @return file offset of entry matching keyRaw, negative if absent */ public long findEntry(BTreeHeader header, final long keyRaw) { - final long key = keyRaw & ctx.equalityMask(); final int blockSize = ctx.BLOCK_SIZE_WORDS(); + + final long key = keyRaw & ctx.equalityMask(); final long dataAddress = header.dataOffsetLongs(); if (header.layers() == 0) { // For small data, we only have a data block From 1068694db681bc0624eff917300c4ba3d1769e6e Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 20 Jun 2022 12:35:58 +0200 Subject: [PATCH 08/40] Refactoring BTreeReader and binary search code --- .../nu/marginalia/util/btree/BTreeReader.java | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java index 42605c04..5d86c4d2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java @@ -5,6 +5,8 @@ import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.multimap.MultimapSearcher; +import javax.annotation.CheckReturnValue; + import static java.lang.Math.min; public class BTreeReader { @@ -37,19 +39,22 @@ public class BTreeReader { final long key = keyRaw & ctx.equalityMask(); final long dataAddress = header.dataOffsetLongs(); - if (header.layers() == 0) { // For small data, we only have a data block - return dataSearcher.binarySearch(key, dataAddress, header.numEntries()); - } + final long searchStart; + final long numEntries; - // Search index layers - long dataLayerOffset = searchIndex(header, key); - if (dataLayerOffset < 0) { - return dataLayerOffset; + if (header.layers() == 0) { // For small data, there is no index block, only a flat data block + searchStart = dataAddress; + numEntries = header.numEntries(); } + else { + long dataLayerOffset = searchIndex(header, key); + if (dataLayerOffset < 0) { + return dataLayerOffset; + } - // Search the corresponding data block - final long searchStart = dataAddress + dataLayerOffset * ctx.entrySize(); - final long numEntries = min(header.numEntries() - dataLayerOffset, blockSize); + searchStart = dataAddress + dataLayerOffset * ctx.entrySize(); + numEntries = min(header.numEntries() - dataLayerOffset, blockSize); + } return dataSearcher.binarySearch(key, searchStart, numEntries); } @@ -61,14 +66,15 @@ public class BTreeReader { long layerOffset = 0; for (int i = header.layers() - 1; i >= 0; --i) { - final long layerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset; + final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset; - final long nextLayerOffset = indexSearch(key, indexAddress + layerBlockOffset, blockSize); + final long nextLayerOffset = indexSearch(key, indexAddress + indexLayerBlockOffset, blockSize); if (nextLayerOffset < 0) - return -1; + return nextLayerOffset; - layerOffset = blockSize *(nextLayerOffset + layerOffset); + layerOffset = blockSize * (nextLayerOffset + layerOffset); } + return layerOffset; } From 35878c510244aaac83fca373003a22a5d30d212a Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 22 Jun 2022 12:57:58 +0200 Subject: [PATCH 09/40] Anchor text capture work-in-progress --- .../java/nu/marginalia/util/DenseBitMap.java | 37 ++++ .../nu/marginalia/util/RandomWriteFunnel.java | 2 +- .../nu/marginalia/util/btree/BTreeReader.java | 6 +- .../processing/model/DocumentSentence.java | 37 +++- .../wmsa/edge/converting/ConverterMain.java | 16 +- .../converting/LinkKeywordExtractorMain.java | 194 ++++++++++++++++++ .../processor/logic/LinkParser.java | 9 +- .../conversion/SearchIndexConverter.java | 15 -- .../conversion/words/WordsTableWriter.java | 4 +- .../wmsa/edge/model/EdgeCrawlPlan.java | 11 + .../nu/marginalia/util/DenseBitMapTest.java | 56 +++++ 11 files changed, 343 insertions(+), 44 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/util/DenseBitMapTest.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java new file mode 100644 index 00000000..39b34048 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java @@ -0,0 +1,37 @@ +package nu.marginalia.util; + +import java.nio.ByteBuffer; + +public class DenseBitMap { + public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8; + + public final long cardinality; + private final ByteBuffer buffer; + + public DenseBitMap(long cardinality) { + this.cardinality = cardinality; + + boolean misaligned = (cardinality & 7) > 0; + this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0))); + } + + public boolean get(long pos) { + return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0; + } + + /** Set the bit indexed by pos, returns + * its previous value. + */ + public boolean set(long pos) { + int offset = (int) (pos >>> 3); + int oldVal = buffer.get(offset); + int mask = (byte) 1 << (int) (pos & 7); + buffer.put(offset, (byte) (oldVal | mask)); + return (oldVal & mask) != 0; + } + + public void clear(long pos) { + int offset = (int)(pos >>> 3); + buffer.put(offset, (byte)(buffer.get(offset) & ~(byte)(1 << (int)(pos & 7)))); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java b/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java index 0c274c2b..ada8de71 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java @@ -125,7 +125,7 @@ public class RandomWriteFunnel implements AutoCloseable { dest.putLong(addr, data); } catch (IndexOutOfBoundsException ex) { - logger.info("!!!bad[{}]={}", addr, data); + logger.info("Bad poke[{}]={}, this happens if an RWF is allocated with insufficient size", addr, data); } } buffer.compact(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java index 5d86c4d2..388eb175 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java @@ -5,8 +5,6 @@ import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.multimap.MultimapSearcher; -import javax.annotation.CheckReturnValue; - import static java.lang.Math.min; public class BTreeReader { @@ -68,7 +66,7 @@ public class BTreeReader { for (int i = header.layers() - 1; i >= 0; --i) { final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset; - final long nextLayerOffset = indexSearch(key, indexAddress + indexLayerBlockOffset, blockSize); + final long nextLayerOffset = relativePositionInIndex(key, indexAddress + indexLayerBlockOffset, blockSize); if (nextLayerOffset < 0) return nextLayerOffset; @@ -78,7 +76,7 @@ public class BTreeReader { return layerOffset; } - private long indexSearch(long key, long start, long n) { + private long relativePositionInIndex(long key, long start, long n) { return indexSearcher.binarySearchUpper(key, start, n) - start; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java index b4406954..5630939f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java @@ -2,15 +2,17 @@ package nu.marginalia.util.language.processing.model; import nu.marginalia.util.language.WordPatterns; +import org.jetbrains.annotations.NotNull; import java.lang.ref.SoftReference; import java.util.BitSet; +import java.util.Iterator; import java.util.StringJoiner; import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; -public class DocumentSentence { +public class DocumentSentence implements Iterable{ public final String originalSentence; public final String[] words; public final int[] separators; @@ -85,4 +87,37 @@ public class DocumentSentence { public String toString() { return IntStream.range(0, length()).mapToObj(i -> String.format("%s[%s]", words[i], posTags[i])).collect(Collectors.joining(" ")); } + + @NotNull + @Override + public Iterator iterator() { + return new Iterator<>() { + int i = -1; + @Override + public boolean hasNext() { + return i+1 < length(); + } + + @Override + public SentencePos next() { + return new SentencePos(++i); + } + }; + } + + public class SentencePos { + public final int pos; + + public SentencePos(int pos) { + this.pos = pos; + } + + public String word() { return words[pos]; } + public String wordLowerCase() { return wordsLowerCase[pos]; } + public String posTag() { return posTags[pos]; } + public String stemmed() { return stemmedWords[pos]; } + public int separator() { return separators[pos]; } + public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); } + } } + diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java index 5d6f2762..61ff0b00 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java @@ -52,13 +52,6 @@ public class ConverterMain { injector.getInstance(ConverterMain.class); } - private static void requireArgs(String[] args, String... help) { - if (args.length != help.length) { - System.out.println("Usage: " + String.join(", ", help)); - System.exit(255); - } - } - @Inject public ConverterMain( EdgeCrawlPlan plan, @@ -103,7 +96,8 @@ public class ConverterMain { domainToId.forEach((domain, id) -> { String fileName = idToFileName.get(id); - Path dest = getFilePath(plan.crawl.getDir(), fileName); + Path dest = plan.getCrawledFilePath(fileName); + logger.info("{} - {} - {}", domain, id, dest); if (!processLog.isJobFinished(id)) { @@ -128,10 +122,4 @@ public class ConverterMain { record ProcessingInstructions(String id, List instructions) {} - private Path getFilePath(Path dir, String fileName) { - String sp1 = fileName.substring(0, 2); - String sp2 = fileName.substring(2, 4); - return dir.resolve(sp1).resolve(sp2).resolve(fileName); - } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java new file mode 100644 index 00000000..63c26200 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java @@ -0,0 +1,194 @@ +package nu.marginalia.wmsa.edge.converting; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import com.google.inject.Guice; +import com.google.inject.Inject; +import com.google.inject.Injector; +import nu.marginalia.util.DenseBitMap; +import nu.marginalia.util.language.WordPatterns; +import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.wmsa.configuration.WmsaHome; +import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; +import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; +import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader; +import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader; +import nu.marginalia.wmsa.edge.crawling.WorkLog; +import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import org.jsoup.Jsoup; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +public class LinkKeywordExtractorMain { + private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class); + + public static void main(String... args) throws IOException { + + if (args.length != 1) { + System.err.println("Arguments: crawl-plan.yaml"); + System.exit(0); + } + var plan = new CrawlPlanLoader().load(Path.of(args[0])); + + Injector injector = Guice.createInjector( + new ConverterModule(plan) + ); + + injector.getInstance(LinkKeywordExtractorMain.class); + } + + private final HashSet crawledDomains = new HashSet<>(); + private final List fileNames = new ArrayList<>(); + private final LinkParser linkParser = new LinkParser(); + private final SentenceExtractor sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels()); + + private final HashFunction hashFunction = Hashing.murmur3_128(); + + // This bit map is used as a bloom filter to deduplicate url-keyword combinations + // false positives are expected, but that's an acceptable trade-off to not have to deal with + // de-duplicating billions of shuffled (url, word) tuples on limited hardware + private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS); + + @Inject + public LinkKeywordExtractorMain(EdgeCrawlPlan plan) throws IOException { + logger.info("Loading input spec"); + + CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(), + spec -> crawledDomains.add(spec.domain)); + + logger.info("Replaying crawl log"); + WorkLog.readLog(plan.crawl.getLogFile(), + entry -> fileNames.add(entry.path())); + + logger.info("Reading files"); + for (var fn : fileNames) { + CrawledDomainReader crawledDomainReader = new CrawledDomainReader(); + var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn)); + if (crawledDomain.doc == null) continue; + + System.out.println("# " + crawledDomain.domain); + + for (var doc : crawledDomain.doc) { + try { + if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) { + processDocument(doc.url, doc.documentBody); + } + } + catch (URISyntaxException ex) { + // This Shouldn't Happen (TM) as the URL that we're failing to process + // is expected to have already been parsed by this code successfully + // in the process of getting here. + // + // But also, if it does happen, it's no big deal + + logger.warn("Bad URL format", ex); + } + } + } + } + + private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+"); + + private void processDocument(String docUrl, String documentBody) throws URISyntaxException { + var processed = Jsoup.parse(documentBody); + + EdgeUrl documentUrl = new EdgeUrl(docUrl); + + for (var link : processed.getElementsByTag("a")) { + if (link.hasAttr("href")) { + String href = link.attr("href"); + String text = anchorTextNoise.matcher(link.text().toLowerCase()).replaceAll(" ").trim(); + + processAnchor(documentUrl, href, text); + } + } + } + + private void processAnchor(EdgeUrl documentUrl, String href, String text) { + if (!isInterestingAnchorText(text)) { + return; + } + + var optLinkUrl = linkParser.parseLink(documentUrl, href); + if (optLinkUrl.isEmpty()) return; + + var linkUrl = optLinkUrl.get(); + + if (!isInterestingAnchorLink(linkUrl)) { + return; + } + + DocumentLanguageData languageData = sentenceExtractor.extractSentences(text); + for (var sent : languageData.sentences) { + for (var wordPos : sent) { + if (wordPos.isStopWord()) + continue; + + String word = wordPos.wordLowerCase(); + if (!WordPatterns.wordQualitiesPredicate.test(word) || !WordPatterns.filter(word)) + continue; + + + if (!linkUrl.domain.equals(documentUrl.domain)) { + if (isNewKeywordForLink(word, linkUrl.toString())) { + System.out.println(linkUrl + "\t" + word); + } + } + } + } + } + + // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine + private final Predicate looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate(); + + private boolean isInterestingAnchorText(String text) { + if (text.isBlank()) return false; + if (text.length() > 32) return false; + + // Google loves questions, and so does SEO spammers + if (text.endsWith("?")) return false; + + if (text.startsWith("http:") || text.startsWith("https:")) return false; + + if (looksLikeAnURL.test(text)) return false; + + return switch (text) { + case "this", "here", "click", "click here", "download", "source" -> false; + default -> true; + }; + } + + private boolean isInterestingAnchorLink(EdgeUrl linkUrl) { + if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) { + return false; + } + + return crawledDomains.contains(linkUrl.domain.toString()); + } + + private boolean isNewKeywordForLink(String href, String text) { + long hash = 0; + + hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong(); + hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong(); + + // Remove sign bit because we don't want a negative index in deduplicateHashBitset + hash &= 0x7FFF_FFFF_FFFF_FFFFL; + + return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index 378182f2..0a2bdf45 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -145,13 +145,8 @@ public class LinkParser { } private boolean isRelRelevant(String rel) { - if (null == rel) { - return true; - } - return switch (rel) { - case "noindex" -> false; - default -> true; - }; + // this is null safe + return !"noindex".equalsIgnoreCase(rel); } private boolean isUrlRelevant(String href) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java index 2d12d0f4..afa319f4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java @@ -91,18 +91,13 @@ public class SearchIndexConverter { } } - - private WordIndexOffsetsTable createWordIndexTable(SearchIndexJournalReader journalReader, File outputFileWords) throws IOException { final int topWord = (int) journalReader.fileHeader.wordCount(); - logger.debug("Table size = {}", topWord); WordsTableWriter wordsTableWriter = new WordsTableWriter(topWord); - logger.debug("Reading words"); - for (var entry : journalReader) { if (!isRelevantEntry(entry)) { continue; @@ -119,8 +114,6 @@ public class SearchIndexConverter { } } - logger.debug("Rearranging table"); - wordsTableWriter.write(outputFileWords); return wordsTableWriter.getTable(); @@ -130,15 +123,12 @@ public class SearchIndexConverter { Path tmpUrlsFile, WordIndexOffsetsTable wordOffsetsTable) throws IOException { - logger.info("Table size = {}", wordOffsetsTable.length()); - long numberOfWordsTotal = 0; for (var entry : journalReader) { if (isRelevantEntry(entry)) numberOfWordsTotal += entry.wordCount(); } - try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw"); FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) { @@ -168,7 +158,6 @@ public class SearchIndexConverter { } } - rwf.write(urlsTmpFileChannel); } @@ -176,8 +165,6 @@ public class SearchIndexConverter { try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) { if (wordOffsetsTable.length() > 0) { - logger.info("Sorting urls table"); - var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit); wordOffsetsTable.forEachRange(urlTmpFileSorter::sort); @@ -188,7 +175,6 @@ public class SearchIndexConverter { } } - logger.info("Writing BTree"); try (var urlsFileMap = MultimapFileLong.forOutput(outputFileUrls.toPath(), numberOfWordsTotal)) { var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext); @@ -206,7 +192,6 @@ public class SearchIndexConverter { } } - private long translateUrl(long url) { int domainId = partitioner.translateId(bucketId, (int) (url >>> 32)); return ((long)domainId << 32) | (url & 0xFFFFFFFFL); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java index 7f762ff3..15ad0cd3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java @@ -43,11 +43,11 @@ public class WordsTableWriter { var writer = new BTreeWriter(mmf, wordsBTreeContext); - writer.write(offset, tableSize, this::writeBTreeBlock); + writer.write(offset, tableSize, this::writeBTreeDataBlock); } } - private void writeBTreeBlock(MultimapFileLongSlice mapSlice) { + private void writeBTreeDataBlock(MultimapFileLongSlice mapSlice) { long urlFileOffset = 0; int idx = 0; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java index 4e237908..264c1051 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java @@ -27,4 +27,15 @@ public class EdgeCrawlPlan { } } + public Path getCrawledFilePath(String fileName) { + String sp1 = fileName.substring(0, 2); + String sp2 = fileName.substring(2, 4); + return crawl.getDir().resolve(sp1).resolve(sp2).resolve(fileName); + } + + public Path getProcessedFilePath(String fileName) { + String sp1 = fileName.substring(0, 2); + String sp2 = fileName.substring(2, 4); + return process.getDir().resolve(sp1).resolve(sp2).resolve(fileName); + } } diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/DenseBitMapTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/DenseBitMapTest.java new file mode 100644 index 00000000..20857947 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/util/DenseBitMapTest.java @@ -0,0 +1,56 @@ +package nu.marginalia.util; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class DenseBitMapTest { + + @Test + public void testSetAll() { + var dbm = new DenseBitMap(129); + for (int i = 0; i < dbm.cardinality; i++) { + dbm.set(i); + } + + for (int i = 0; i < dbm.cardinality; i++) { + assertTrue(dbm.get(i)); + } + } + + @Test + public void testSetEven() { + var dbm = new DenseBitMap(131); + for (int i = 0; i < dbm.cardinality; i+=2) { + dbm.set(i); + } + + for (int i = 0; i < dbm.cardinality; i+=2) { + assertTrue(dbm.get(i)); + } + + for (int i = 1; i < dbm.cardinality; i+=2) { + assertFalse(dbm.get(i)); + } + } + + @Test + public void testSetAllClearSome() { + var dbm = new DenseBitMap(129); + + for (int i = 0; i < dbm.cardinality; i++) { + dbm.set(i); + } + for (int i = 1; i < dbm.cardinality; i+=2) { + dbm.clear(i); + } + + for (int i = 0; i < dbm.cardinality; i+=2) { + assertTrue(dbm.get(i), "Expected " + i + " to be set"); + } + + for (int i = 1; i < dbm.cardinality; i+=2) { + assertFalse(dbm.get(i), "Expected " + i + " to be clear"); + } + } +} \ No newline at end of file From 48e4aa3ee848e44ab7868adbea51ac9341990e36 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 22 Jun 2022 13:01:46 +0200 Subject: [PATCH 10/40] Clean up old junk from the WordPatterns class --- .../util/language/WordPatterns.java | 22 +++---------------- .../converting/LinkKeywordExtractorMain.java | 4 ++-- 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java index 391558f4..3a95072b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java @@ -3,7 +3,9 @@ package nu.marginalia.util.language; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; -import java.util.*; +import java.util.HashSet; +import java.util.Objects; +import java.util.Set; import java.util.function.Predicate; import java.util.regex.Pattern; @@ -13,21 +15,13 @@ public class WordPatterns { public static final String WORD_TOKEN_JOINER = "_"; public static final Pattern wordPattern = Pattern.compile("[#]?[_@.a-zA-Z0-9'+\\-\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+[#]?"); - public static final Pattern wordPatternRestrictive = Pattern.compile("[#]?[@a-zA-Z0-9'+\\-\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+[#]?"); - public static final Pattern keyWordPattern = Pattern.compile("[A-Z\\u00C0-\\u00D6][_a-zA-Z\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]{0,32}('[a-zA-Z])?"); public static final Pattern wordAppendixPattern = Pattern.compile("[.]?[0-9a-zA-Z\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]{1,3}[0-9]?"); - public static final Pattern joinWord = Pattern.compile("(as|an|the|of|in|a)"); - public static final Pattern keywordAppendixPattern = Pattern.compile("([0-9A-Z][A-Z0-9]{0,3})"); public static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))"); public static final Pattern characterNoisePattern = Pattern.compile("^[/+\\-]+$"); public static final Predicate wordQualitiesPredicate = wordPattern.asMatchPredicate(); - public static final Predicate restrictivePredicate = wordPatternRestrictive.asMatchPredicate(); public static final Predicate wordAppendixPredicate = wordAppendixPattern.asMatchPredicate(); - public static final Predicate keywordPredicate = keyWordPattern.asMatchPredicate(); - public static final Predicate keywordAppendixPredicate = keywordAppendixPattern.asMatchPredicate(); public static final Predicate wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate); - public static final Predicate keywordPredicateEither = keywordPredicate.or(keywordAppendixPredicate); public static final Predicate characterNoisePredicate = characterNoisePattern.asMatchPredicate(); public static final Set topWords; @@ -88,16 +82,6 @@ public class WordPatterns { return true; } - public static boolean filterStrict(String word) { - - int numDigits = (int) word.chars().filter(Character::isDigit).count(); - if (numDigits == word.length()) { - return false; - } - - return true; - } - public static boolean isStopWord(String s) { if (s.length() < MIN_WORD_LENGTH) { return true; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java index 63c26200..f60541e3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java @@ -139,9 +139,9 @@ public class LinkKeywordExtractorMain { continue; String word = wordPos.wordLowerCase(); - if (!WordPatterns.wordQualitiesPredicate.test(word) || !WordPatterns.filter(word)) - continue; + if (!WordPatterns.filter(word)) + continue; if (!linkUrl.domain.equals(documentUrl.domain)) { if (isNewKeywordForLink(word, linkUrl.toString())) { From 4516b23f90806671dda7256c7891642f5f29f839 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 22 Jun 2022 13:12:44 +0200 Subject: [PATCH 11/40] Also grab alt text for images in a-tags in anchor text extractor --- .../converting/LinkKeywordExtractorMain.java | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java index f60541e3..570c47b5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java @@ -19,6 +19,8 @@ import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -101,23 +103,42 @@ public class LinkKeywordExtractorMain { } } - private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+"); private void processDocument(String docUrl, String documentBody) throws URISyntaxException { - var processed = Jsoup.parse(documentBody); - - EdgeUrl documentUrl = new EdgeUrl(docUrl); + final Document processed = Jsoup.parse(documentBody); + final EdgeUrl documentUrl = new EdgeUrl(docUrl); for (var link : processed.getElementsByTag("a")) { if (link.hasAttr("href")) { String href = link.attr("href"); - String text = anchorTextNoise.matcher(link.text().toLowerCase()).replaceAll(" ").trim(); + String text = getLinkText(link); processAnchor(documentUrl, href, text); } } } + private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+"); + + private String getLinkText(Element link) { + String text = link.text(); + + if (link.text().isBlank()) { + text = getLinkTextByImgAltTag(link); + } + + return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim(); + } + + private String getLinkTextByImgAltTag(Element link) { + for (var img: link.getElementsByTag("img")) { + if (img.hasAttr("alt")) { + return img.attr("alt"); + } + } + return ""; + } + private void processAnchor(EdgeUrl documentUrl, String href, String text) { if (!isInterestingAnchorText(text)) { return; From e1b34771156a6f2c64ad71993a35213e6424129f Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 23 Jun 2022 17:02:28 +0200 Subject: [PATCH 12/40] Experiments in keyword extraction --- .../converting/LinkKeywordExtractorMain.java | 294 ++++++++---------- .../converting/atags/AnchorTextExtractor.java | 149 +++++++++ .../java/org/openzim/ZIMTypes/ZIMReader.java | 206 +----------- 3 files changed, 292 insertions(+), 357 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java index 570c47b5..792dac6f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java @@ -1,215 +1,193 @@ package nu.marginalia.wmsa.edge.converting; -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; -import com.google.inject.Guice; -import com.google.inject.Inject; -import com.google.inject.Injector; -import nu.marginalia.util.DenseBitMap; -import nu.marginalia.util.language.WordPatterns; -import nu.marginalia.util.language.processing.SentenceExtractor; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; +import gnu.trove.set.hash.TIntHashSet; +import nu.marginalia.wmsa.edge.converting.atags.AnchorTextExtractor; import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader; import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader; import nu.marginalia.wmsa.edge.crawling.WorkLog; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.wmsa.edge.integration.stackoverflow.StackOverflowPostsReader; +import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader; import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; +import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.BufferedOutputStream; +import java.io.FileOutputStream; import java.io.IOException; -import java.net.URISyntaxException; -import java.nio.charset.StandardCharsets; +import java.io.OutputStream; +import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Objects; -import java.util.function.Predicate; -import java.util.regex.Pattern; public class LinkKeywordExtractorMain { private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class); - public static void main(String... args) throws IOException { + public static void main(String... args) throws IOException, InterruptedException { - if (args.length != 1) { - System.err.println("Arguments: crawl-plan.yaml"); + if (args.length < 2) { + System.err.println("Arguments: [crawl|so|wiki] crawl-plan.yaml [data]"); System.exit(0); } - var plan = new CrawlPlanLoader().load(Path.of(args[0])); - Injector injector = Guice.createInjector( - new ConverterModule(plan) - ); + String command = args[0]; + var plan = new CrawlPlanLoader().load(Path.of(args[1])); + + switch (command) { + case "crawl": getKeywordsFromCrawl(plan); break; + case "so": getKeywordsFromSo(plan, args[2]); break; + case "wiki": getKeywordsFromWiki(plan, args[2]); break; + default: System.err.println("Unrecognized command"); + } - injector.getInstance(LinkKeywordExtractorMain.class); } - private final HashSet crawledDomains = new HashSet<>(); - private final List fileNames = new ArrayList<>(); - private final LinkParser linkParser = new LinkParser(); - private final SentenceExtractor sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels()); + private static void getKeywordsFromWiki(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException { - private final HashFunction hashFunction = Hashing.murmur3_128(); - // This bit map is used as a bloom filter to deduplicate url-keyword combinations - // false positives are expected, but that's an acceptable trade-off to not have to deal with - // de-duplicating billions of shuffled (url, word) tuples on limited hardware - private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS); + HashSet crawledDomains = new HashSet<>(); + TIntHashSet crawledUrls = new TIntHashSet(50_000_000); + + logger.info("Loading URLs"); + Files.lines(Path.of("/home/vlofgren/good-urls3.txt")) + .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange")) + .mapToInt(String::hashCode) + .forEach(crawledUrls::add); + + logger.info("Loading input spec"); + CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(), + spec -> { crawledDomains.add(spec.domain); }); + + try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) { + AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(domain -> crawledDomains.contains(domain) + && !domain.contains("wiki") + && !domain.contains("isni") + && !domain.contains("wiktionary"), + url -> crawledUrls.contains(url.toString().hashCode()), + output::write); + + new WikipediaReader(arg, new EdgeDomain("invalid.example"), article -> { + anchorTextExtractor.processDocument(article.getUrl().toString(), article.body); + }).join(); + } + catch (IOException ex) { + ex.printStackTrace(); + } + + + + } + + private static void getKeywordsFromSo(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException { + TIntHashSet crawledUrls = new TIntHashSet(50_000_000); + + logger.info("Loading URLs"); + Files.lines(Path.of("/home/vlofgren/good-urls3.txt")) + .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange")) + .mapToInt(String::hashCode) + .forEach(crawledUrls::add); - @Inject - public LinkKeywordExtractorMain(EdgeCrawlPlan plan) throws IOException { logger.info("Loading input spec"); + HashSet crawledDomains = new HashSet<>(); CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(), spec -> crawledDomains.add(spec.domain)); + crawledDomains.remove("jsfiddle.net"); // like 30% of SO's links go here + crawledDomains.remove("jsbin.com"); + crawledDomains.remove("codepad.org"); + + + try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) { + AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains, + url -> crawledUrls.contains(url.toString().hashCode()), + output::write); + + new StackOverflowPostsReader(arg, new EdgeDomain("invalid.example"), post -> { + anchorTextExtractor.processDocument(post.getUrl().toString(), post.fullBody); + }).join(); + } + catch (IOException ex) { + ex.printStackTrace(); + } + } + + + public static void getKeywordsFromCrawl(EdgeCrawlPlan plan) throws IOException { + + TIntHashSet crawledUrls = new TIntHashSet(50_000_000); + + logger.info("Loading URLs"); + Files.lines(Path.of("/home/vlofgren/good-urls3.txt")) + .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange")) + .mapToInt(String::hashCode) + .forEach(crawledUrls::add); + + + logger.info("Loading input spec"); + + HashSet crawledDomains = new HashSet<>(); + CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(), + spec -> crawledDomains.add(spec.domain)); + + List fileNames = new ArrayList<>(); + logger.info("Replaying crawl log"); WorkLog.readLog(plan.crawl.getLogFile(), entry -> fileNames.add(entry.path())); - logger.info("Reading files"); - for (var fn : fileNames) { - CrawledDomainReader crawledDomainReader = new CrawledDomainReader(); - var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn)); - if (crawledDomain.doc == null) continue; + try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) { + AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains, + url -> crawledUrls.contains(url.toString().hashCode()), + output::write); - System.out.println("# " + crawledDomain.domain); + logger.info("Reading files"); + for (var fn : fileNames) { + CrawledDomainReader crawledDomainReader = new CrawledDomainReader(); + var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn)); + if (crawledDomain.doc == null) continue; - for (var doc : crawledDomain.doc) { - try { + System.out.println("# " + crawledDomain.domain); + + for (var doc : crawledDomain.doc) { if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) { - processDocument(doc.url, doc.documentBody); - } - } - catch (URISyntaxException ex) { - // This Shouldn't Happen (TM) as the URL that we're failing to process - // is expected to have already been parsed by this code successfully - // in the process of getting here. - // - // But also, if it does happen, it's no big deal - - logger.warn("Bad URL format", ex); - } - } - } - } - - - private void processDocument(String docUrl, String documentBody) throws URISyntaxException { - final Document processed = Jsoup.parse(documentBody); - final EdgeUrl documentUrl = new EdgeUrl(docUrl); - - for (var link : processed.getElementsByTag("a")) { - if (link.hasAttr("href")) { - String href = link.attr("href"); - String text = getLinkText(link); - - processAnchor(documentUrl, href, text); - } - } - } - - private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+"); - - private String getLinkText(Element link) { - String text = link.text(); - - if (link.text().isBlank()) { - text = getLinkTextByImgAltTag(link); - } - - return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim(); - } - - private String getLinkTextByImgAltTag(Element link) { - for (var img: link.getElementsByTag("img")) { - if (img.hasAttr("alt")) { - return img.attr("alt"); - } - } - return ""; - } - - private void processAnchor(EdgeUrl documentUrl, String href, String text) { - if (!isInterestingAnchorText(text)) { - return; - } - - var optLinkUrl = linkParser.parseLink(documentUrl, href); - if (optLinkUrl.isEmpty()) return; - - var linkUrl = optLinkUrl.get(); - - if (!isInterestingAnchorLink(linkUrl)) { - return; - } - - DocumentLanguageData languageData = sentenceExtractor.extractSentences(text); - for (var sent : languageData.sentences) { - for (var wordPos : sent) { - if (wordPos.isStopWord()) - continue; - - String word = wordPos.wordLowerCase(); - - if (!WordPatterns.filter(word)) - continue; - - if (!linkUrl.domain.equals(documentUrl.domain)) { - if (isNewKeywordForLink(word, linkUrl.toString())) { - System.out.println(linkUrl + "\t" + word); + anchorTextExtractor.processDocument(doc.url, doc.documentBody); } } } } + } - // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine - private final Predicate looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate(); + private static class UrlKeywordTsvWriter implements AutoCloseable { - private boolean isInterestingAnchorText(String text) { - if (text.isBlank()) return false; - if (text.length() > 32) return false; + private final OutputStream stream; - // Google loves questions, and so does SEO spammers - if (text.endsWith("?")) return false; - - if (text.startsWith("http:") || text.startsWith("https:")) return false; - - if (looksLikeAnURL.test(text)) return false; - - return switch (text) { - case "this", "here", "click", "click here", "download", "source" -> false; - default -> true; - }; - } - - private boolean isInterestingAnchorLink(EdgeUrl linkUrl) { - if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) { - return false; + UrlKeywordTsvWriter(Path outputFile) throws IOException { + this.stream = new BufferedOutputStream(new FileOutputStream(outputFile.toFile())); } - return crawledDomains.contains(linkUrl.domain.toString()); + void write(EdgeUrl url, String keyword) { + try { + stream.write(url.toString().getBytes()); + stream.write('\t'); + stream.write(keyword.getBytes()); + stream.write('\n'); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public void close() throws IOException { + stream.close(); + } } - private boolean isNewKeywordForLink(String href, String text) { - long hash = 0; - - hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong(); - hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong(); - - // Remove sign bit because we don't want a negative index in deduplicateHashBitset - hash &= 0x7FFF_FFFF_FFFF_FFFFL; - - return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality); - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java new file mode 100644 index 00000000..c96fd400 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java @@ -0,0 +1,149 @@ +package nu.marginalia.wmsa.edge.converting.atags; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import lombok.SneakyThrows; +import nu.marginalia.util.DenseBitMap; +import nu.marginalia.util.language.WordPatterns; +import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import java.nio.charset.StandardCharsets; +import java.util.function.BiConsumer; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +public class AnchorTextExtractor { + private final Predicate includeDomainPredicate; + private final Predicate includeUrlPredicate; + private final BiConsumer linkKeywordConsumer; + + private final LinkParser linkParser = new LinkParser(); + + private final HashFunction hashFunction = Hashing.murmur3_128(); + + // This bit map is used as a bloom filter to deduplicate url-keyword combinations + // false positives are expected, but that's an acceptable trade-off to not have to deal with + // de-duplicating billions of shuffled (url, word) tuples on limited hardware + private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS); + + public AnchorTextExtractor(Predicate includeDomainPredicate, + Predicate includeUrlPredicate, + BiConsumer linkKeywordConsumer) { + this.includeDomainPredicate = includeDomainPredicate; + this.includeUrlPredicate = includeUrlPredicate; + this.linkKeywordConsumer = linkKeywordConsumer; + } + + @SneakyThrows + public void processDocument(String docUrl, String documentBody) { + final Document processed = Jsoup.parse(documentBody); + final EdgeUrl documentUrl = new EdgeUrl(docUrl); + + for (var link : processed.getElementsByTag("a")) { + if (link.hasAttr("href")) { + String href = link.attr("href"); + String text = getLinkText(link); + + processAnchor(documentUrl, href, text); + } + } + } + + private final Pattern anchorTextNoise = Pattern.compile("[ \t\n\"()“”]+"); + + private String getLinkText(Element link) { + String text = link.text(); + + if (link.text().isBlank()) { + for (var img: link.getElementsByTag("img")) { + if (img.hasAttr("alt")) { + text = img.attr("alt"); + break; + } + } + } + + return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim(); + } + + private void processAnchor(EdgeUrl documentUrl, String href, String text) { + if (!isInterestingAnchorText(text)) { + return; + } + if (href.contains("?")) { + return; + } + + var optLinkUrl = linkParser.parseLink(documentUrl, href); + if (optLinkUrl.isEmpty()) return; + + var linkUrl = optLinkUrl.get(); + + if (!isInterestingAnchorLink(linkUrl)) { + return; + } + + for (String word: anchorTextNoise.split(text)) { + if (WordPatterns.isStopWord(word)) + continue; + + word = word.toLowerCase(); + if (!WordPatterns.filter(word)) + continue; + + if (!linkUrl.domain.equals(documentUrl.domain)) { + if (isNewKeywordForLink(word, linkUrl.toString())) { + linkKeywordConsumer.accept(linkUrl, word); + } + } + } + } + + // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine + private final Predicate looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate(); + + private boolean isInterestingAnchorText(String text) { + if (text.isBlank()) return false; + if (text.length() > 32) return false; + + // Google loves questions, and so does SEO spammers + if (text.endsWith("?")) return false; + + if (text.startsWith("http:") || text.startsWith("https:")) return false; + + if (looksLikeAnURL.test(text)) return false; + + return switch (text) { + case "this", "here", "click", "click here", "download", "source" -> false; + default -> true; + }; + } + + private boolean isInterestingAnchorLink(EdgeUrl linkUrl) { + if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) { + return false; + } + + if (!includeUrlPredicate.test(linkUrl)) { + return false; + } + + return includeDomainPredicate.test(linkUrl.domain.toString()); + } + + private boolean isNewKeywordForLink(String href, String text) { + long hash = 0; + + hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong(); + hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong(); + + // Remove sign bit because we don't want a negative index in deduplicateHashBitset + hash &= 0x7FFF_FFFF_FFFF_FFFFL; + + return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality); + } +} diff --git a/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java b/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java index d97c3c73..7706e8d1 100644 --- a/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java +++ b/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java @@ -18,20 +18,20 @@ package org.openzim.ZIMTypes; -import java.io.*; -import java.util.*; -import java.util.function.BiConsumer; -import java.util.function.Consumer; -import java.util.function.Predicate; - import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdInputStream; import lombok.AllArgsConstructor; import lombok.Getter; import org.jetbrains.annotations.NotNull; -import org.tukaani.xz.SingleXZInputStream; import org.openzim.util.RandomAcessFileZIMInputStream; import org.openzim.util.Utilities; +import org.tukaani.xz.SingleXZInputStream; + +import java.io.*; +import java.util.*; +import java.util.function.BiConsumer; +import java.util.function.Consumer; +import java.util.function.Predicate; /** * @author Arunesh Mathur @@ -401,198 +401,6 @@ public class ZIMReader { } - public String getArticleData(DirectoryEntry mainEntry) throws IOException { - - byte[] buffer = new byte[8]; - - if (mainEntry != null) { - - // Check what kind of an entry was mainEnrty - if (mainEntry.getClass() == ArticleEntry.class) { - - // Cast to ArticleEntry - ArticleEntry article = (ArticleEntry) mainEntry; - - // Get the cluster and blob numbers from the article - long clusterNumber = article.getClusterNumber(); - int blobNumber = article.getBlobnumber(); - - // Move to the cluster entry in the clusterPtrPos - mReader.seek(mFile.getClusterPtrPos() + clusterNumber * 8); - - // Read the location of the cluster - long clusterPos = mReader - .readEightLittleEndianBytesValue(buffer); - - // Move to the cluster - mReader.seek(clusterPos); - - // Read the first byte, for compression information - int compressionType = mReader.read(); - - // Reference declaration - SingleXZInputStream xzReader = null; - int firstOffset, numberOfBlobs, offset1, - offset2, - location, - differenceOffset; - - ByteArrayOutputStream baos; - - // Check the compression type that was read - switch (compressionType) { - - // TODO: Read uncompressed data directly - case 0: - case 1: - - // Read the first 4 bytes to find out the number of artciles - buffer = new byte[4]; - - // Create a dictionary with size 40MiB, the zimlib uses this - // size while creating - - // Read the first offset - mReader.read(buffer); - - // The first four bytes are the offset of the zeroth blob - firstOffset = Utilities - .toFourLittleEndianInteger(buffer); - - // The number of blobs - numberOfBlobs = firstOffset / 4; - - // The blobNumber has to be lesser than the numberOfBlobs - assert blobNumber < numberOfBlobs; - - - if (blobNumber == 0) { - // The first offset is what we read earlier - offset1 = firstOffset; - } else { - - location = (blobNumber - 1) * 4; - Utilities.skipFully(mReader, location); - mReader.read(buffer); - offset1 = Utilities.toFourLittleEndianInteger(buffer); - } - - mReader.read(buffer); - offset2 = Utilities.toFourLittleEndianInteger(buffer); - - differenceOffset = offset2 - offset1; - buffer = new byte[differenceOffset]; - - Utilities.skipFully(mReader, - (offset1 - 4 * (blobNumber + 2))); - - mReader.read(buffer, 0, differenceOffset); - - return new String(buffer); - - // LZMA2 compressed data - case 4: - - // Read the first 4 bytes to find out the number of artciles - buffer = new byte[4]; - - // Create a dictionary with size 40MiB, the zimlib uses this - // size while creating - xzReader = new SingleXZInputStream(mReader, 4194304); - - // Read the first offset - xzReader.read(buffer); - - // The first four bytes are the offset of the zeroth blob - firstOffset = Utilities - .toFourLittleEndianInteger(buffer); - - // The number of blobs - numberOfBlobs = firstOffset / 4; - - // The blobNumber has to be lesser than the numberOfBlobs - assert blobNumber < numberOfBlobs; - - if(blobNumber == 0) { - // The first offset is what we read earlier - offset1 = firstOffset; - } else { - - location = (blobNumber - 1) * 4; - Utilities.skipFully(xzReader, location); - xzReader.read(buffer); - offset1 = Utilities.toFourLittleEndianInteger(buffer); - } - - xzReader.read(buffer); - offset2 = Utilities.toFourLittleEndianInteger(buffer); - - differenceOffset = offset2 - offset1; - buffer = new byte[differenceOffset]; - - Utilities.skipFully(xzReader, - (offset1 - 4 * (blobNumber + 2))); - - xzReader.read(buffer, 0, differenceOffset); - return new String(buffer); - - case 5: - // Read the first 4 bytes to find out the number of artciles - buffer = new byte[4]; - - // Create a dictionary with size 40MiB, the zimlib uses this - // size while creating - var zstdInputStream = new com.github.luben.zstd.ZstdInputStream(new BufferedInputStream(mReader)); - - // Read the first offset - zstdInputStream.read(buffer); - - // The first four bytes are the offset of the zeroth blob - firstOffset = Utilities - .toFourLittleEndianInteger(buffer); - - // The number of blobs - numberOfBlobs = firstOffset / 4; - - // The blobNumber has to be lesser than the numberOfBlobs - assert blobNumber < numberOfBlobs; - - if(blobNumber == 0) { - // The first offset is what we read earlier - offset1 = firstOffset; - } else { - - location = (blobNumber - 1) * 4; - Utilities.skipFully(zstdInputStream, location); - zstdInputStream.read(buffer); - offset1 = Utilities.toFourLittleEndianInteger(buffer); - } - - zstdInputStream.read(buffer); - offset2 = Utilities.toFourLittleEndianInteger(buffer); - - differenceOffset = offset2 - offset1; - buffer = new byte[differenceOffset]; - - Utilities.skipFully(zstdInputStream, - (offset1 - 4 * (blobNumber + 2))); - - zstdInputStream.read(buffer, 0, differenceOffset); - - return new String(buffer); - - default: - System.err.print("What is compression = " + compressionType); - - } - - } - } - - return null; - - } - public DirectoryEntry getDirectoryInfoAtTitlePosition(long position) throws IOException { From ee07c4d94ae12b8a988dbcdff8bf4032004245bf Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sun, 26 Jun 2022 16:44:08 +0200 Subject: [PATCH 13/40] Refactored s/DictionaryWriter/KeywordLexicon/g to use significantly less memory and (potentially) support UTF-8. --- .../marginalia/util/dict/DictionaryData.java | 144 ++----- .../util/dict/DictionaryHashMap.java | 54 +-- .../wmsa/edge/index/EdgeIndexService.java | 31 +- .../wmsa/edge/index/IndexServicesFactory.java | 39 +- .../conversion/SearchIndexConverter.java | 20 +- .../index/dictionary/DictionaryWriter.java | 367 ------------------ .../index/dictionary/TokenCompressor.java | 83 ---- .../journal/SearchIndexJournalReader.java | 5 +- .../journal/SearchIndexJournalWriter.java | 3 + .../journal/SearchIndexJournalWriterImpl.java | 8 +- .../{ => model}/SearchIndexJournalEntry.java | 2 +- .../SearchIndexJournalEntryHeader.java | 2 +- .../SearchIndexJournalFileHeader.java | 2 +- .../edge/index/lexicon/KeywordLexicon.java | 117 ++++++ .../KeywordLexiconReadOnlyView.java} | 13 +- .../journal/KeywordLexiconJournal.java | 69 ++++ .../KeywordLexiconJournalCommitQueue.java | 41 ++ .../journal/KeywordLexiconJournalFile.java | 157 ++++++++ .../wmsa/edge/index/reader/SearchIndexes.java | 12 +- .../index/service/DictionaryWriterTest.java | 164 ++++---- .../service/SearchIndexJournalWriterTest.java | 21 +- .../index/service/TokenCompressorTest.java | 28 -- .../service/util/DictionaryDataTest.java | 28 +- .../service/util/DictionaryHashMapTest.java | 115 +++--- 24 files changed, 671 insertions(+), 854 deletions(-) delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/{ => model}/SearchIndexJournalEntry.java (95%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/{ => model}/SearchIndexJournalEntryHeader.java (90%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/{ => model}/SearchIndexJournalFileHeader.java (59%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{dictionary/DictionaryReader.java => lexicon/KeywordLexiconReadOnlyView.java} (61%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java index 847259db..c36c10d2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java @@ -5,7 +5,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.ByteBuffer; -import java.util.Arrays; +import java.nio.LongBuffer; public class DictionaryData { @@ -17,22 +17,22 @@ public class DictionaryData { public DictionaryData(int bankSize) { DICTIONARY_BANK_SIZE = bankSize; - banks.add(new DictionaryDataBank(0)); + banks.add(new DictionaryDataBank(0, bankSize)); } public int size() { return banks.end(); } - public int add(byte[] data, int value) { + public int add(long key) { var activeBank = banks.last(); - int rb = activeBank.add(data, value); + int rb = activeBank.add(key); if (rb == -1) { int end = activeBank.getEnd(); logger.debug("Switching bank @ {}", end); - var newBank = new DictionaryDataBank(end); - rb = newBank.add(data, value); + var newBank = new DictionaryDataBank(end, DICTIONARY_BANK_SIZE); + rb = newBank.add(key); banks.add(newBank); } @@ -41,33 +41,30 @@ public class DictionaryData { } - public byte[] getBytes(int offset) { - return banks.bankForOffset(offset).getBytes(offset); + public long getKey(int offset) { + return banks.bankForOffset(offset).getKey(offset); } - public boolean keyEquals(int offset, byte[] data) { - return banks.bankForOffset(offset).keyEquals(offset, data); + public boolean keyEquals(int offset, long otherKey) { + return banks.bankForOffset(offset).keyEquals(offset, otherKey); } - public int getValue(int offset) { - return banks.bankForOffset(offset).getValue(offset); - } - - public class DictionaryDataBank { + private static class DictionaryDataBank { private final int start_idx; - private final ByteBuffer data; + + // Humongous long-lived arrays seem to sometimes yield considerable memory overhead and + // can make the GC behave poorly. Using off-heap memory seems preferred when their + // lifetime is "forever" + + private final LongBuffer keys; private int size; - private int[] offset; - private int[] value; - public DictionaryDataBank(int start_idx) { + + public DictionaryDataBank(int start_idx, int sz) { this.start_idx = start_idx; - data = ByteBuffer.allocateDirect(DICTIONARY_BANK_SIZE); - - offset = new int[DICTIONARY_BANK_SIZE/16]; - value = new int[DICTIONARY_BANK_SIZE/16]; + keys = ByteBuffer.allocateDirect(8*sz).asLongBuffer(); size = 0; } @@ -79,102 +76,23 @@ public class DictionaryData { return start_idx + size; } - public byte[] getBytes(int idx) { + public long getKey(int idx) { + if (idx < start_idx || idx - start_idx >= size) { + throw new IndexOutOfBoundsException(idx); + } + return keys.get(idx - start_idx); + } + + public boolean keyEquals(int idx, long other) { if (idx < start_idx || idx - start_idx >= size) { throw new IndexOutOfBoundsException(idx); } - idx = idx - start_idx; - - final int start; - final int end = offset[idx]; - - if (idx == 0) start = 0; - else start = offset[idx-1]; - - byte[] dst = new byte[end-start]; - data.get(start, dst); - return dst; + return keys.get(idx - start_idx) == other; } - public int getValue(int idx) { - if (idx < start_idx || idx - start_idx >= size) { - throw new IndexOutOfBoundsException(idx); - } - return value[idx - start_idx]; - } - - public boolean keyEquals(int idx, byte[] data) { - if (idx < start_idx || idx - start_idx >= size) { - throw new IndexOutOfBoundsException(idx); - } - - idx = idx - start_idx; - int start; - int end = offset[idx]; - - if (idx == 0) { - start = 0; - } - else { - start = offset[idx-1]; - } - if (data.length != end - start) { - return false; - } - for (int i = 0; i < data.length; i++) { - if (this.data.get(start + i) != data[i]) { - return false; - } - } - return true; - } - - public long longHashCode(int idx) { - if (idx < start_idx || idx - start_idx >= size) { - throw new IndexOutOfBoundsException(idx); - } - - idx = idx - start_idx; - int start; - int end = offset[idx]; - - if (idx == 0) { - start = 0; - } - else { - start = offset[idx-1]; - } - - long result = 1; - for (int i = start; i < end; i++) - result = 31 * result + data.get(i); - - return result; - } - - public int add(byte[] newData, int newValue) { - if (size == offset.length) { - logger.debug("Growing bank from {} to {}", offset.length, offset.length*2); - offset = Arrays.copyOf(offset, offset.length*2); - value = Arrays.copyOf(value, value.length*2); - } - - if (size > 0 && offset[size-1]+newData.length >= DICTIONARY_BANK_SIZE) { - if (offset.length > size+1) { - logger.debug("Shrinking bank from {} to {}", offset.length, size - 1); - offset = Arrays.copyOf(offset, size + 1); - value = Arrays.copyOf(value, size + 1); - } - return -1; // Full - } - - int dataOffset = size > 0 ? offset[size-1] : 0; - - data.put(dataOffset, newData); - - offset[size] = dataOffset + newData.length; - value[size] = newValue; + public int add(long newKey) { + keys.put(size, newKey); return start_idx + size++; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java index d655b6a2..5544545a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java @@ -82,9 +82,6 @@ public class DictionaryHashMap { } } - public int memSz() { - return dictionaryData.size(); - } public int size() { return sz.get(); } @@ -101,20 +98,20 @@ public class DictionaryHashMap { buffers[buffer].put(bufferIdx, val); } - public int put(byte[] data, int value) { + public int put(long key) { - long hash = longHash(data) & 0x7FFF_FFFF_FFFF_FFFFL; + long hash = key & 0x7FFF_FFFF_FFFF_FFFFL; long idx = hash % hashTableSize; if (getCell(idx) == NO_VALUE) { - return setValue(data, value, idx); + return setValue(key, idx); } - return putRehash(data, value, idx, hash); + return putRehash(key, idx, hash); } - private int putRehash(byte[] data, int value, long idx, long hash) { + private int putRehash(long key, long idx, long hash) { final long pStride = 1 + (hash % (hashTableSize - 2)); for (long j = 1; j < maxProbeLength; j++) { @@ -129,9 +126,9 @@ public class DictionaryHashMap { if (val == NO_VALUE) { probe_count_metrics.set(j); - return setValue(data, value, idx); + return setValue(key, idx); } - else if (dictionaryData.keyEquals(val, data)) { + else if (dictionaryData.keyEquals(val, key)) { return val; } } @@ -139,16 +136,16 @@ public class DictionaryHashMap { throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%"); } - private int setValue(byte[] data, int value, long cell) { + private int setValue(long key, long cell) { sz.incrementAndGet(); - int di = dictionaryData.add(data, value); + int di = dictionaryData.add(key); setCell(cell, di); return di; } - public int get(byte[] data) { - final long hash = longHash(data) & 0x7FFF_FFFF_FFFF_FFFFL; + public int get(long key) { + final long hash = key & 0x7FFF_FFFF_FFFF_FFFFL; final long cell = hash % hashTableSize; if (getCell(cell) == NO_VALUE) { @@ -157,15 +154,15 @@ public class DictionaryHashMap { else { int val = getCell(cell); - if (dictionaryData.keyEquals(val, data)) { - return dictionaryData.getValue(val); + if (dictionaryData.keyEquals(val, key)) { + return val; } } - return getRehash(data, cell, hash); + return getRehash(key, cell, hash); } - private int getRehash(byte[] data, long idx, long hash) { + private int getRehash(long key, long idx, long hash) { final long pStride = 1 + (hash % (hashTableSize - 2)); for (long j = 1; j < maxProbeLength; j++) { @@ -180,29 +177,12 @@ public class DictionaryHashMap { if (val == NO_VALUE) { return NO_VALUE; } - else if (dictionaryData.keyEquals(val, data)) { - return dictionaryData.getValue(val); + else if (dictionaryData.keyEquals(val, key)) { + return val; } } throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%"); } - private long longHash(byte[] bytes) { - if (bytes == null) - return 0; - - // https://cp-algorithms.com/string/string-hashing.html - int p = 127; - long m = (1L<<61)-1; - long p_power = 1; - long hash_val = 0; - - for (byte element : bytes) { - hash_val = (hash_val + (element+1) * p_power) % m; - p_power = (p_power * p) % m; - } - return hash_val; - } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index 829a59af..96f1fb72 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -13,18 +13,22 @@ import io.prometheus.client.Histogram; import io.reactivex.rxjava3.schedulers.Schedulers; import marcono1234.gson.recordadapter.RecordTypeAdapterFactory; import nu.marginalia.util.ListChunker; +import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.Service; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntryHeader; -import nu.marginalia.wmsa.edge.index.model.*; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader; +import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; +import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; +import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; -import nu.marginalia.util.dict.DictionaryHashMap; -import nu.marginalia.wmsa.edge.model.*; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeId; +import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import nu.marginalia.wmsa.edge.model.search.*; @@ -53,7 +57,7 @@ public class EdgeIndexService extends Service { @NotNull private final Initialization init; private final SearchIndexes indexes; - private final DictionaryWriter dictionaryWriter; + private final KeywordLexicon keywordLexicon; private final Gson gson = new GsonBuilder() .registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create()) @@ -80,7 +84,7 @@ public class EdgeIndexService extends Service { this.init = init; this.indexes = indexes; - this.dictionaryWriter = servicesFactory.getDictionaryWriter(); + this.keywordLexicon = servicesFactory.getKeywordLexicon(); Spark.post("/words/", this::putWords); Spark.post("/search/", this::search, gson::toJson); @@ -186,15 +190,18 @@ public class EdgeIndexService extends Service { for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) { - var entry = new SearchIndexJournalEntry(getWordIds(chunk)); + var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk)); var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block); indexWriter.put(header, entry); }; } - private long[] getWordIds(List words) { - return words.stream().filter(w -> w.length() < Byte.MAX_VALUE).mapToLong(dictionaryWriter::get).toArray(); + private long[] getOrInsertWordIds(List words) { + return words.stream() + .filter(w -> w.length() < Byte.MAX_VALUE) + .mapToLong(keywordLexicon::getOrInsert) + .toArray(); } private Object search(Request request, Response response) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java index 40c733e2..b3b4d45e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java @@ -4,17 +4,19 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.inject.name.Named; import lombok.SneakyThrows; +import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPreconverter; import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; +import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; +import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView; +import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; import nu.marginalia.wmsa.edge.index.reader.SearchIndex; import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,13 +38,13 @@ public class IndexServicesFactory { private final Logger logger = LoggerFactory.getLogger(getClass()); private final PartitionedDataFile writerIndexFile; - private final RootDataFile writerDictionaryFile; + private final RootDataFile keywordLexiconFile; private final PartitionedDataFile preconverterOutputFile; private final DoublePartitionedDataFile indexReadWordsFile; private final DoublePartitionedDataFile indexReadUrlsFile; private final DoublePartitionedDataFile indexWriteWordsFile; private final DoublePartitionedDataFile indexWriteUrlsFile; - private volatile static DictionaryWriter dictionaryWriter; + private volatile static KeywordLexicon keywordLexicon; private final Long dictionaryHashMapSize; private final SearchIndexPartitioner partitioner; @@ -53,7 +55,7 @@ public class IndexServicesFactory { @Named("partition-root-slow-tmp") Path partitionRootSlowTmp, @Named("partition-root-fast") Path partitionRootFast, @Named("edge-writer-page-index-file") String writerIndexFile, - @Named("edge-writer-dictionary-file") String writerDictionaryFile, + @Named("edge-writer-dictionary-file") String keywordLexiconFile, @Named("edge-index-read-words-file") String indexReadWordsFile, @Named("edge-index-read-urls-file") String indexReadUrlsFile, @Named("edge-index-write-words-file") String indexWriteWordsFile, @@ -68,7 +70,7 @@ public class IndexServicesFactory { this.domainBlacklist = domainBlacklist; this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, writerIndexFile); - this.writerDictionaryFile = new RootDataFile(partitionRootSlow, writerDictionaryFile); + this.keywordLexiconFile = new RootDataFile(partitionRootSlow, keywordLexiconFile); this.indexReadWordsFile = new DoublePartitionedDataFile(partitionRootFast, indexReadWordsFile); this.indexReadUrlsFile = new DoublePartitionedDataFile(partitionRootFast, indexReadUrlsFile); this.indexWriteWordsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteWordsFile); @@ -78,19 +80,22 @@ public class IndexServicesFactory { } public SearchIndexJournalWriterImpl getIndexWriter(int idx) { - return new SearchIndexJournalWriterImpl(getDictionaryWriter(), writerIndexFile.get(idx)); - } - - public DictionaryWriter getDictionaryWriter() { - if (dictionaryWriter == null) { - dictionaryWriter = new DictionaryWriter(writerDictionaryFile.get(), dictionaryHashMapSize, true); - } - return dictionaryWriter; + return new SearchIndexJournalWriterImpl(getKeywordLexicon(), writerIndexFile.get(idx)); } @SneakyThrows - public DictionaryReader getDictionaryReader() { - return new DictionaryReader(getDictionaryWriter()); + public KeywordLexicon getKeywordLexicon() { + if (keywordLexicon == null) { + final var journal = new KeywordLexiconJournal(keywordLexiconFile.get()); + keywordLexicon = new KeywordLexicon(journal, + new DictionaryHashMap(dictionaryHashMapSize)); + } + return keywordLexicon; + } + + @SneakyThrows + public KeywordLexiconReadOnlyView getDictionaryReader() { + return new KeywordLexiconReadOnlyView(getKeywordLexicon()); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java index afa319f4..adce8747 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java @@ -1,24 +1,26 @@ package nu.marginalia.wmsa.edge.index.conversion; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter; +import nu.marginalia.util.RandomWriteFunnel; import nu.marginalia.util.btree.BTreeWriter; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.multimap.MultimapFileLong; -import nu.marginalia.util.RandomWriteFunnel; +import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; +import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable; +import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.*; +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; import java.nio.channels.FileChannel; import java.nio.file.Files; import java.nio.file.Path; -import static nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry.MAX_LENGTH; +import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH; public class SearchIndexConverter { public static final BTreeContext urlsBTreeContext = new BTreeContext(5, 1, ~0, 8); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java deleted file mode 100644 index 906231be..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java +++ /dev/null @@ -1,367 +0,0 @@ -package nu.marginalia.wmsa.edge.index.dictionary; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import com.google.inject.name.Named; -import io.prometheus.client.Gauge; -import lombok.SneakyThrows; -import nu.marginalia.util.language.WordPatterns; -import nu.marginalia.util.dict.DictionaryHashMap; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.*; -import java.nio.ByteBuffer; -import java.util.*; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.locks.Lock; -import java.util.concurrent.locks.ReadWriteLock; -import java.util.concurrent.locks.ReentrantReadWriteLock; - -@Singleton -public class DictionaryWriter implements AutoCloseable { - private final ArrayList commitQueue = new ArrayList<>(10_000); - - private final DictionaryHashMap reverseIndex; - private final boolean prepopulate; - - private final ReadWriteLock memoryLock = new ReentrantReadWriteLock(); - private final ReadWriteLock diskLock = new ReentrantReadWriteLock(); - private final RandomAccessFile raf; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - static final AtomicInteger instances = new AtomicInteger(); - - private final TokenCompressor readOnlyTokenCompressor = new TokenCompressor(this::getReadOnly); - private final TokenCompressor tokenCompressor = new TokenCompressor(this::get); - - private static final Gauge request_time_metrics - = Gauge.build("wmsa_edge_index_dictionary_size", "Dictionary Size") - .register(); - - private volatile boolean running = true; - private final Thread commitToDiskThread; - @SneakyThrows - public long getPos() { - return raf.getFilePointer(); - } - - @SneakyThrows @Inject - public DictionaryWriter( - @Named("edge-writer-dictionary-file") File dictionaryFile, - @Named("edge-dictionary-hash-map-size") Long hashMapSize, - boolean prepopulate) { - logger.info("Creating dictionary writer"); - raf = new RandomAccessFile(dictionaryFile, "rw"); - reverseIndex = new DictionaryHashMap(hashMapSize); - this.prepopulate = prepopulate; - - Lock writeLock = diskLock.writeLock(); - try { - writeLock.lock(); - loadFile(dictionaryFile); - } - finally { - writeLock.unlock(); - } - - commitToDiskThread = new Thread(this::commitToDiskRunner, "CommitToDiskThread"); - commitToDiskThread.start(); - - Runtime.getRuntime().addShutdownHook(new Thread(this::commitToDisk)); - - if (!instances.compareAndSet(0, 1)) { - logger.error("MULTIPLE WRITER INSTANCES!"); - } - logger.info("Done creating dictionary writer"); - } - - - public void commitToDiskRunner() { - while (running) { - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - e.printStackTrace(); - } - - commitToDisk(); - } - } - - public void prepare() { - if (!prepopulate) - return; - - try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/word-frequency"), - "Could not load word frequency table"); - var br = new BufferedReader(new InputStreamReader(resource)) - ) { - for (;;) { - var line = br.readLine(); - if (line == null) { - break; - } - if (WordPatterns.wordPredicateEither.test(line)) { - get(line); - } - } - } catch (IOException e) { - e.printStackTrace(); - } - - } - @SneakyThrows - private void loadFile(File dictionaryFile) { - if (!dictionaryFile.exists()) { - logger.info("File {} does not exist, can't load", dictionaryFile); - return; - } - - logger.info("Reading {}", dictionaryFile); - - long pos; - if (raf.length() < 8) { - pos = 8; - raf.writeLong(pos); - } - else { - pos = raf.readLong(); - } - - logger.info("Length {} ({})", pos, raf.length()); - if (pos == 8) { - logger.info("Empty DB, prepopulating"); - prepare(); - } - - ByteBuffer buffer = ByteBuffer.allocateDirect(8192); - - var channel = raf.getChannel(); - - long cp = channel.position(); - int debugNext = 0; - try { - buffer.limit(0); - long loaded = 0; - - while (cp < pos || buffer.hasRemaining()) { - if (buffer.limit() - buffer.position() < 4) { - buffer.compact(); - - long rb = channel.read(buffer); - if (rb <= 0) { - break; - } - cp += rb; - - buffer.flip(); - } - - int len = buffer.get(); - if (debugNext > 0) { - logger.warn("NextLen: {} ({})", len, (char) len); - } - while (buffer.limit() - buffer.position() < len) { - buffer.compact(); - int rb = channel.read(buffer); - if (rb <= 0) break; - cp += rb; - buffer.flip(); - } - - if (buffer.limit() < len) { - - logger.warn("Partial write at end-of-file!"); - - if (cp >= pos) { - logger.info("... but it's ok"); - } - break; - } - - boolean negativeLen = false; - if (len < 0) { - len = (len&0xFF); - negativeLen = true; - - } - - byte[] data = new byte[len]; - buffer.get(data); - if ((++loaded % 10_000_000) == 0L) { - logger.info("Loaded {} million items", loaded/1_000_000); - } - - if (debugNext > 0) { - logger.warn("Next word {}", new String(data)); - if (--debugNext == 0) { - logger.info(" "); - } - } - if (negativeLen) { - logger.warn("Negative length of word {} {}@{}", len, new String(data), reverseIndex.size()); - debugNext = 10; - } - -// if (reverseIndex.get(data) != DictionaryHashMap.NO_VALUE) { -// logger.error("Duplicate insert"); -// } - reverseIndex.put(data, reverseIndex.size()); - } - } - catch (Exception ex) { - logger.error("IO Exception", ex); - } - - raf.seek(pos); - request_time_metrics.set(reverseIndex.size()); - - logger.info("Initial loading done, dictionary size {}", reverseIndex.size()); - } - - private final ByteBuffer commitBuffer = ByteBuffer.allocateDirect(4096); - public volatile boolean noCommit = false; - @SneakyThrows - public void commitToDisk() { - if (noCommit) return; - - if (!raf.getChannel().isOpen()) { - logger.error("commitToDisk() with closed channel! Cannot commit!"); - return; - } - - Lock memLock = memoryLock.readLock(); - List data; - try { - memLock.lock(); - if (commitQueue.isEmpty()) - return; - data = new ArrayList<>(commitQueue); - commitQueue.clear(); - } - finally { - memLock.unlock(); - } - - var channel = raf.getChannel(); - commitBuffer.clear(); - - Lock writeLock = diskLock.writeLock(); - // Only acquire memory lock if there's a risk of backpressure - if (data.size() < 1000) { - memLock = null; - } - - try { - if (memLock != null) memLock.lock(); - writeLock.lock(); - - long start = System.currentTimeMillis(); - int ct = data.size(); - - for (byte[] item : data) { - commitBuffer.clear(); - commitBuffer.put((byte) item.length); - commitBuffer.put(item); - commitBuffer.flip(); - - while (commitBuffer.position() < commitBuffer.limit()) - channel.write(commitBuffer, channel.size()); - } - - long pos = channel.size(); - commitBuffer.clear(); - commitBuffer.putLong(pos); - commitBuffer.flip(); - channel.write(commitBuffer, 0); - - channel.force(false); - - logger.debug("Comitted {} items in {} ms", ct, System.currentTimeMillis() - start); - } - catch (Exception ex) { - logger.error("Error during dictionary commit!!!", ex); - } - finally { - writeLock.unlock(); - if (memLock != null) { - memLock.unlock(); - } - } - } - - public int get(String macroWord) { - byte[] word = tokenCompressor.getWordBytes(macroWord); - - Lock lock = memoryLock.readLock(); - try { - lock.lock(); - int idx = reverseIndex.get(word); - if (idx >= 0) { - return idx; - } - } - finally { - lock.unlock(); - } - - lock = memoryLock.writeLock(); - try { - lock.lock(); - int idx = reverseIndex.get(word); - if (idx >= 0) { - return idx; - } - - if (!noCommit) { - commitQueue.add(word); - } - - idx = reverseIndex.size(); - - reverseIndex.put(word, idx); - - request_time_metrics.set(reverseIndex.size()); - - return idx; - } - finally { - - lock.unlock(); - } - } - - public int getReadOnly(String word) { - var bytes = readOnlyTokenCompressor.getWordBytes(word); - if (bytes.length == 0) { - return DictionaryHashMap.NO_VALUE; - } - return reverseIndex.get(bytes); - } - - public int size() { - Lock lock = memoryLock.readLock(); - try { - lock.lock(); - return reverseIndex.size(); - } - finally { - lock.unlock(); - } - } - - @Override - public void close() throws Exception { - logger.warn("Closing DictionaryWriter"); - - running = false; - commitToDiskThread.join(); - commitToDisk(); - - raf.close(); - } - -} - diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java deleted file mode 100644 index 5a3d73ab..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java +++ /dev/null @@ -1,83 +0,0 @@ -package nu.marginalia.wmsa.edge.index.dictionary; - -import nu.marginalia.util.ByteFolder; -import nu.marginalia.util.dict.DictionaryHashMap; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Arrays; -import java.util.function.Predicate; -import java.util.function.ToIntFunction; -import java.util.regex.Pattern; - -public class TokenCompressor { - private final ToIntFunction mapper; - private final ByteFolder folder = new ByteFolder(); - public static final byte[] EMPTY = new byte[0]; - - private static final Logger logger = LoggerFactory.getLogger(TokenCompressor.class); - - private static final Predicate intPatternMatcher = Pattern.compile("[1-9][0-9]{1,8}").asMatchPredicate(); - - - public TokenCompressor(ToIntFunction mapper) { - this.mapper = mapper; - } - final char[] separators = new char[] { '_', '-', '.', '/' }; - public synchronized byte[] getWordBytes(String macroWord) { - int ui = -1; - - for (char c : separators) { - int ui2 = macroWord.indexOf(c); - if (ui < 0) ui = ui2; - else if (ui2 >= 0) ui = Math.min(ui, ui2); - } - - if (ui <= 0 || ui >= macroWord.length()-1) { - return getByteRepresentation(macroWord); - } - - String car = macroWord.substring(0, ui); - String cdr = macroWord.substring(ui+1); - - int carId = mapper.applyAsInt(car); - int cdrId = mapper.applyAsInt(cdr); - - if (carId == DictionaryHashMap.NO_VALUE || cdrId == DictionaryHashMap.NO_VALUE) { - return EMPTY; - } - - return folder.foldBytes(carId, cdrId); - } - - private byte[] getByteRepresentation(String word) { - if (intPatternMatcher.test(word)) { - long val = Long.parseLong(word); - if (val < 0x100) { - return new byte[] { 'A', (byte) (val & 0xFF)}; - } - else if (val < 0x10000) { - return new byte[] { 'B', (byte)((val & 0xFF00)>>8), (byte) (val & 0xFF)}; - } - else if (val < 0x1000000) { - return new byte[] { 'C', (byte)((val & 0xFF0000)>>12), (byte)((val & 0xFF00)>>8), (byte) (val & 0xFF)}; - } - else if (val < 0x100000000L) { - return new byte[] { 'D', (byte)((val & 0xFF0000)>>16), (byte)((val & 0xFF0000)>>12), (byte)((val & 0xFF00)>>8), (byte) (val & 0xFF)}; - } - } - - var bytes = word.getBytes(); - for (int i = 0; i < bytes.length; i++) { - if (bytes[i] < 32 && (bytes[i] & 0x80) == 0) { - logger.error("Bad byte in {} -> {} ({})", word, bytes[i], (char) bytes[i]); - bytes[i] = '?'; - } - } - if (bytes.length >= Byte.MAX_VALUE) { - return Arrays.copyOf(bytes, Byte.MAX_VALUE); - } - return bytes; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java index 0e11646a..94ebeacf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java @@ -3,13 +3,16 @@ package nu.marginalia.wmsa.edge.index.journal; import com.upserve.uppend.blobs.NativeIO; import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.multimap.MultimapFileLongSlice; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalFileHeader; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import org.jetbrains.annotations.NotNull; import java.nio.ByteBuffer; import java.util.Iterator; -import static nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS; +import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS; public class SearchIndexJournalReader implements Iterable { public static final long FILE_HEADER_SIZE_LONGS = 2; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java index 4567a428..bad8d4e7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java @@ -1,5 +1,8 @@ package nu.marginalia.wmsa.edge.index.journal; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader; + public interface SearchIndexJournalWriter { void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entry); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java index f5ba8b31..23c4b481 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java @@ -3,7 +3,9 @@ package nu.marginalia.wmsa.edge.index.journal; import io.reactivex.rxjava3.disposables.Disposable; import io.reactivex.rxjava3.schedulers.Schedulers; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader; +import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -16,7 +18,7 @@ import java.nio.channels.FileChannel; import java.util.concurrent.TimeUnit; public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { - private final DictionaryWriter dictionaryWriter; + private final KeywordLexicon dictionaryWriter; private final Logger logger = LoggerFactory.getLogger(getClass()); private final Disposable writerTask; @@ -28,7 +30,7 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { private long pos; @SneakyThrows - public SearchIndexJournalWriterImpl(DictionaryWriter dictionaryWriter, File indexFile) { + public SearchIndexJournalWriterImpl(KeywordLexicon dictionaryWriter, File indexFile) { this.dictionaryWriter = dictionaryWriter; initializeIndexFile(indexFile); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntry.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntry.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntry.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntry.java index 493eea40..c370ecd0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntry.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntry.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.journal; +package nu.marginalia.wmsa.edge.index.journal.model; import java.nio.ByteBuffer; import java.util.Arrays; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntryHeader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntryHeader.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntryHeader.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntryHeader.java index f635b1d4..745a1a21 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntryHeader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntryHeader.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.journal; +package nu.marginalia.wmsa.edge.index.journal.model; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.EdgeDomain; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalFileHeader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalFileHeader.java similarity index 59% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalFileHeader.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalFileHeader.java index 49ac5009..62fea842 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalFileHeader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalFileHeader.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.journal; +package nu.marginalia.wmsa.edge.index.journal.model; public record SearchIndexJournalFileHeader(long fileSize, long wordCount) { } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java new file mode 100644 index 00000000..6485f381 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java @@ -0,0 +1,117 @@ +package nu.marginalia.wmsa.edge.index.lexicon; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import io.prometheus.client.Gauge; +import lombok.SneakyThrows; +import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReadWriteLock; +import java.util.concurrent.locks.ReentrantReadWriteLock; + +public class KeywordLexicon implements AutoCloseable { + private final DictionaryHashMap reverseIndex; + + private final ReadWriteLock memoryLock = new ReentrantReadWriteLock(); + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private static final AtomicInteger instances = new AtomicInteger(); + private final HashFunction hashFunction = Hashing.murmur3_128(); + + private static final Gauge request_time_metrics + = Gauge.build("wmsa_edge_index_dictionary_size", "Dictionary Size") + .register(); + private final KeywordLexiconJournal journal; + + @SneakyThrows + public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal, DictionaryHashMap reverseIndexHashMap) { + + journal = keywordLexiconJournal; + reverseIndex = reverseIndexHashMap; + + logger.info("Creating dictionary writer"); + + if (!instances.compareAndSet(0, 1)) { + logger.error("MULTIPLE WRITER INSTANCES!"); + } + + journal.loadFile(this::loadJournalEntry); + + logger.info("Done creating dictionary writer"); + } + + private void loadJournalEntry(byte[] bytes) { + final long key = hashFunction.hashBytes(bytes).asLong(); + reverseIndex.put(key); + } + + @SneakyThrows + public int getOrInsert(String macroWord) { + final long key = hashFunction.hashBytes(macroWord.getBytes()).asLong(); + + int idx = getReadOnly(key); + if (idx >= 0) + return idx; + + Lock lock = memoryLock.writeLock(); + try { + lock.lock(); + + // Check again to prevent race condition + if ((idx = reverseIndex.get(key)) >= 0) + return idx; + + journal.enqueue(macroWord); + idx = reverseIndex.put(key); + request_time_metrics.set(reverseIndex.size()); + + return idx; + } + finally { + lock.unlock(); + } + } + + public int getReadOnly(String word) { + return getReadOnly(hashFunction.hashBytes(word.getBytes()).asLong()); + } + + public int getReadOnly(long hashedKey) { + Lock lock = memoryLock.readLock(); + try { + lock.lock(); + return reverseIndex.get(hashedKey); + } + finally { + lock.unlock(); + } + } + + public int size() { + Lock lock = memoryLock.readLock(); + try { + lock.lock(); + return reverseIndex.size(); + } + finally { + lock.unlock(); + } + } + + @Override + public void close() throws Exception { + logger.warn("Closing DictionaryWriter"); + + journal.close(); + } + + public void commitToDisk() { + journal.commitToDisk(); + } +} + diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexiconReadOnlyView.java similarity index 61% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexiconReadOnlyView.java index ca10c000..485bb423 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexiconReadOnlyView.java @@ -1,21 +1,18 @@ -package nu.marginalia.wmsa.edge.index.dictionary; +package nu.marginalia.wmsa.edge.index.lexicon; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; -import com.google.inject.Inject; -import com.google.inject.Singleton; import lombok.SneakyThrows; import java.util.concurrent.TimeUnit; -@Singleton -public class DictionaryReader { - private final DictionaryWriter writer; +public class KeywordLexiconReadOnlyView { + private final KeywordLexicon writer; private final Cache cache = CacheBuilder.newBuilder().maximumSize(10_000).expireAfterAccess(60, TimeUnit.SECONDS).build(); - @SneakyThrows @Inject - public DictionaryReader(DictionaryWriter writer) { + @SneakyThrows + public KeywordLexiconReadOnlyView(KeywordLexicon writer) { this.writer = writer; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java new file mode 100644 index 00000000..02d50862 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java @@ -0,0 +1,69 @@ +package nu.marginalia.wmsa.edge.index.lexicon.journal; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.function.Consumer; + +public class KeywordLexiconJournal { + + private static final boolean noCommit = Boolean.getBoolean("DictionaryJournal.noCommit"); + + private final KeywordLexiconJournalCommitQueue commitQueue; + private final KeywordLexiconJournalFile journalFile; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final Thread commitToDiskThread; + + private volatile boolean running = true; + + public KeywordLexiconJournal(File file) throws IOException { + commitQueue = new KeywordLexiconJournalCommitQueue(); + journalFile = new KeywordLexiconJournalFile(file); + + commitToDiskThread = new Thread(this::commitToDiskRunner, "CommitToDiskThread"); + commitToDiskThread.start(); + + Runtime.getRuntime().addShutdownHook(new Thread(this::commitToDisk)); + } + + public void enqueue(String word) throws InterruptedException { + commitQueue.enqueue(word); + } + + + public void commitToDiskRunner() { + if (noCommit) return; + + while (running) { + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + commitToDisk(); + } + } + + public void commitToDisk() { + List entries = commitQueue.getQueuedEntries(); + + journalFile.writeEntriesToJournal(entries); + } + + public void close() throws Exception { + logger.info("Closing Journal"); + running = false; + commitToDiskThread.join(); + commitToDisk(); + + journalFile.close(); + } + + public void loadFile(Consumer loadJournalEntry) throws IOException { + journalFile.loadFile(loadJournalEntry); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java new file mode 100644 index 00000000..6baef0e1 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java @@ -0,0 +1,41 @@ +package nu.marginalia.wmsa.edge.index.lexicon.journal; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class KeywordLexiconJournalCommitQueue { + private final ArrayList commitQueue = new ArrayList<>(10_000); + private final Logger logger = LoggerFactory.getLogger(getClass()); + private static final long BACK_PRESSURE_LIMIT = 25_000; + + public synchronized void enqueue(String word) throws InterruptedException { + for (int queueSize = commitQueue.size(); + queueSize >= BACK_PRESSURE_LIMIT; + queueSize = commitQueue.size()) + { + wait(); + } + + commitQueue.add(word); + } + + + public synchronized List getQueuedEntries() { + if (commitQueue.isEmpty()) + return Collections.emptyList(); + var data = new ArrayList<>(commitQueue); + commitQueue.clear(); + + notifyAll(); + + if (data.size() > BACK_PRESSURE_LIMIT) { + logger.warn("Dictionary Journal Backpressure: {}", data.size()); + } + + return data; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java new file mode 100644 index 00000000..a97eee6c --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java @@ -0,0 +1,157 @@ +package nu.marginalia.wmsa.edge.index.lexicon.journal; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.util.List; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReadWriteLock; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.function.Consumer; + +public class KeywordLexiconJournalFile { + private final RandomAccessFile journalFileRAF; + private final File journalFile; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final ReadWriteLock diskLock = new ReentrantReadWriteLock(); + + + public KeywordLexiconJournalFile(File journalFile) throws IOException { + this.journalFileRAF = new RandomAccessFile(journalFile, "rw");; + this.journalFile = journalFile; + } + + public void loadFile(Consumer acceptEntry) throws IOException { + if (!journalFile.exists()) { + logger.info("File {} does not exist, can't load", journalFile); + return; + } + + logger.info("Reading {}", journalFile); + + long pos; + if (journalFileRAF.length() < 8) { + pos = 8; + journalFileRAF.writeLong(pos); + } + else { + pos = journalFileRAF.readLong(); + } + + logger.info("Length {} ({})", pos, journalFileRAF.length()); + if (pos == 8) { + logger.info("Empty DB"); + } + + ByteBuffer buffer = ByteBuffer.allocateDirect(8192); + + var channel = journalFileRAF.getChannel(); + + long cp = channel.position(); + try { + buffer.limit(0); + long loaded = 0; + + while (cp < pos || buffer.hasRemaining()) { + if (buffer.limit() - buffer.position() < 4) { + buffer.compact(); + + long rb = channel.read(buffer); + if (rb <= 0) { + break; + } + cp += rb; + buffer.flip(); + } + + int len = buffer.get(); + while (buffer.limit() - buffer.position() < len) { + buffer.compact(); + int rb = channel.read(buffer); + if (rb <= 0) break; + cp += rb; + buffer.flip(); + } + + if (buffer.limit() < len) { + logger.warn("Partial write at end-of-file!"); + + if (cp >= pos) { + logger.info("... but it's ok"); + } + break; + } + + byte[] data = new byte[len]; + buffer.get(data); + if ((++loaded % 10_000_000) == 0L) { + logger.info("Loaded {} million items", loaded/1_000_000); + } + + acceptEntry.accept(data); + } + } + catch (Exception ex) { + logger.error("IO Exception", ex); + } + + journalFileRAF.seek(pos); + } + + + private final ByteBuffer writeBuffer = ByteBuffer.allocateDirect(4096); + + public void writeEntriesToJournal(List data) { + if (data.isEmpty()) + return; + + final FileChannel channel = journalFileRAF.getChannel(); + + if (!channel.isOpen()) { + throw new IllegalStateException("commitToDisk() with closed channel! Cannot commit!"); + } + + Lock writeLock = diskLock.writeLock(); + try { + writeLock.lock(); + + long start = System.currentTimeMillis(); + int ct = data.size(); + + for (String item : data) { + writeBuffer.clear(); + writeBuffer.put((byte) item.length()); + writeBuffer.put(item.getBytes()); + writeBuffer.flip(); + + while (writeBuffer.position() < writeBuffer.limit()) + channel.write(writeBuffer, channel.size()); + } + + writeBuffer.clear(); + writeBuffer.putLong(channel.size()); + writeBuffer.flip(); + channel.write(writeBuffer, 0); + + channel.force(false); + + logger.debug("Comitted {} items in {} ms", ct, System.currentTimeMillis() - start); + } + catch (Exception ex) { + logger.error("Error during dictionary commit!!!", ex); + } + finally { + writeLock.unlock(); + } + } + + public void close() throws IOException { + journalFileRAF.close(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java index 01ad1e20..f9e2bfac 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java @@ -3,11 +3,11 @@ package nu.marginalia.wmsa.edge.index.reader; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.wmsa.configuration.server.Initialization; +import nu.marginalia.wmsa.edge.index.EdgeIndexBucket; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; -import nu.marginalia.wmsa.edge.index.EdgeIndexBucket; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; +import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -29,7 +29,7 @@ public class SearchIndexes { private final SearchIndexJournalWriterImpl primaryIndexWriter; private final SearchIndexJournalWriterImpl secondaryIndexWriter; - private DictionaryReader dictionaryReader = null; + private KeywordLexiconReadOnlyView keywordLexiconReadOnlyView = null; @Inject public SearchIndexes(IndexServicesFactory servicesFactory, SearchIndexPartitioner partitioner) { @@ -105,8 +105,8 @@ public class SearchIndexes { } @Nullable - public DictionaryReader getDictionaryReader() { - return dictionaryReader; + public KeywordLexiconReadOnlyView getDictionaryReader() { + return keywordLexiconReadOnlyView; } @@ -127,7 +127,7 @@ public class SearchIndexes { } logger.info("Initializing dictionary reader"); - dictionaryReader = servicesFactory.getDictionaryReader(); + keywordLexiconReadOnlyView = servicesFactory.getDictionaryReader(); } finally { opsLock.unlock(); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java index 961d8304..48ee7c83 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java @@ -1,15 +1,18 @@ package nu.marginalia.wmsa.edge.index.service; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; +import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; +import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView; +import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import java.io.*; +import java.io.File; +import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -51,11 +54,16 @@ class DictionaryWriterTest { new SearchIndexPartitioner(null), val -> false); } + + KeywordLexiconJournal createJournal(File f) throws IOException { + return new KeywordLexiconJournal(f); + } + @SneakyThrows @Test @Disabled void test() { - try (var dict = new DictionaryWriter(Path.of("/home/vlofgren/Code/data/dictionary.dat").toFile(), 1L<<16, false)) { + try (var dict = new KeywordLexicon(createJournal(Path.of("/home/vlofgren/Code/data/dictionary.dat").toFile()), new DictionaryHashMap(1L<<16))) { wait(); } } @@ -65,33 +73,33 @@ class DictionaryWriterTest { @Test void getFold() { var path = Files.createTempFile("dict", ".tmp"); - try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) { - dict.get("hic"); - dict.get("hac"); + try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) { + dict.getOrInsert("hic"); + dict.getOrInsert("hac"); dict.commitToDisk(); - dict.get("quae"); - dict.get("quis"); - dict.get("quem1"); - dict.get("quem2"); - dict.get("quem3"); - dict.get("quem4"); - dict.get("quem5"); - dict.get("quem6"); - dict.get("quem7"); - dict.get("quem8"); - dict.get("quem9"); - dict.get("quem10"); - dict.get("cuis"); - dict.get("haec_hic"); - dict.get("hoc_hac_cuis"); + dict.getOrInsert("quae"); + dict.getOrInsert("quis"); + dict.getOrInsert("quem1"); + dict.getOrInsert("quem2"); + dict.getOrInsert("quem3"); + dict.getOrInsert("quem4"); + dict.getOrInsert("quem5"); + dict.getOrInsert("quem6"); + dict.getOrInsert("quem7"); + dict.getOrInsert("quem8"); + dict.getOrInsert("quem9"); + dict.getOrInsert("quem10"); + dict.getOrInsert("cuis"); + dict.getOrInsert("haec_hic"); + dict.getOrInsert("hoc_hac_cuis"); dict.commitToDisk(); - assertNotEquals(0, dict.get("hac")); - assertEquals(0, dict.get("hic")); + assertNotEquals(0, dict.getOrInsert("hac")); + assertEquals(0, dict.getOrInsert("hic")); } - try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) { - assertNotEquals(0, dict.get("hoc")); - assertEquals(0, dict.get("hic")); + try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) { + assertNotEquals(0, dict.getOrInsert("hoc")); + assertEquals(0, dict.getOrInsert("hic")); } path.toFile().delete(); @@ -101,24 +109,24 @@ class DictionaryWriterTest { @Test void get() { var path = Files.createTempFile("dict", ".tmp"); - try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) { - dict.get("hic"); - dict.get("hac"); - dict.get("haec"); - dict.get("hoc"); + try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) { + dict.getOrInsert("hic"); + dict.getOrInsert("hac"); + dict.getOrInsert("haec"); + dict.getOrInsert("hoc"); dict.commitToDisk(); - dict.get("quae"); - dict.get("quis"); - dict.get("quem"); - dict.get("cuis"); + dict.getOrInsert("quae"); + dict.getOrInsert("quis"); + dict.getOrInsert("quem"); + dict.getOrInsert("cuis"); dict.commitToDisk(); - assertNotEquals(0, dict.get("hac")); - assertEquals(0, dict.get("hic")); + assertNotEquals(0, dict.getOrInsert("hac")); + assertEquals(0, dict.getOrInsert("hic")); } - try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) { - assertNotEquals(0, dict.get("hoc")); - assertEquals(0, dict.get("hic")); + try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) { + assertNotEquals(0, dict.getOrInsert("hoc")); + assertEquals(0, dict.getOrInsert("hic")); } path.toFile().delete(); @@ -129,25 +137,25 @@ class DictionaryWriterTest { void getDoubleWrite() { var path = Files.createTempFile("dict", ".tmp"); - try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) { + try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) { dict.commitToDisk(); } - try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) { - dict.get("hic"); - dict.get("hac"); - dict.get("haec"); - dict.get("hoc"); - dict.get("quae"); - dict.get("quis"); - dict.get("quem"); - dict.get("cuis"); + try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) { + dict.getOrInsert("hic"); + dict.getOrInsert("hac"); + dict.getOrInsert("haec"); + dict.getOrInsert("hoc"); + dict.getOrInsert("quae"); + dict.getOrInsert("quis"); + dict.getOrInsert("quem"); + dict.getOrInsert("cuis"); dict.commitToDisk(); - assertNotEquals(0, dict.get("hac")); - assertEquals(0, dict.get("hic")); + assertNotEquals(0, dict.getOrInsert("hac")); + assertEquals(0, dict.getOrInsert("hic")); } - var dict = new DictionaryReader(new DictionaryWriter(path.toFile(), 1L<<16, false)); + var dict = new KeywordLexiconReadOnlyView(new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))); assertNotEquals(0, dict.get("hoc")); assertEquals(0, dict.get("hic")); @@ -160,38 +168,38 @@ class DictionaryWriterTest { void getDoubleWrite2() { var path = Files.createTempFile("dict", ".tmp"); - try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) { - dict.get("hic"); - dict.get("hac"); - dict.get("haec"); - dict.get("hoc"); - dict.get("quae"); - dict.get("quis"); - dict.get("quem"); - dict.get("cuis"); + try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) { + dict.getOrInsert("hic"); + dict.getOrInsert("hac"); + dict.getOrInsert("haec"); + dict.getOrInsert("hoc"); + dict.getOrInsert("quae"); + dict.getOrInsert("quis"); + dict.getOrInsert("quem"); + dict.getOrInsert("cuis"); dict.commitToDisk(); - assertNotEquals(0, dict.get("hac")); - assertEquals(0, dict.get("hic")); + assertNotEquals(0, dict.getOrInsert("hac")); + assertEquals(0, dict.getOrInsert("hic")); } - try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) { - dict.get("fe"); - dict.get("fi"); - dict.get("fo"); - dict.get("fum"); + try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) { + dict.getOrInsert("fe"); + dict.getOrInsert("fi"); + dict.getOrInsert("fo"); + dict.getOrInsert("fum"); dict.commitToDisk(); - assertNotEquals(0, dict.get("hac")); - assertEquals(0, dict.get("hic")); + assertNotEquals(0, dict.getOrInsert("hac")); + assertEquals(0, dict.getOrInsert("hic")); } - try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) { - dict.get("bip"); - dict.get("bap"); + try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) { + dict.getOrInsert("bip"); + dict.getOrInsert("bap"); dict.commitToDisk(); } - var dict = new DictionaryReader(new DictionaryWriter(path.toFile(), 1L<<16, false)); + var dict = new KeywordLexiconReadOnlyView(new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))); assertEquals(0, dict.get("hic")); assertEquals(1, dict.get("hac")); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java index 39a62033..c900f0f6 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java @@ -1,15 +1,15 @@ package nu.marginalia.wmsa.edge.index.service; import lombok.SneakyThrows; +import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.util.multimap.MultimapFileLong; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntryHeader; import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; -import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader; +import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; +import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -17,12 +17,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; class SearchIndexJournalWriterTest { - DictionaryWriter dictionaryWriter; + KeywordLexicon keywordLexicon; SearchIndexJournalWriterImpl writer; Path indexFile; @@ -37,11 +36,11 @@ class SearchIndexJournalWriterTest { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - dictionaryWriter = new DictionaryWriter(dictionaryFile.toFile(), 1L<<16, false); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<16)); indexFile = Files.createTempFile("tmp", ".idx"); indexFile.toFile().deleteOnExit(); - writer = new SearchIndexJournalWriterImpl(dictionaryWriter, indexFile.toFile()); + writer = new SearchIndexJournalWriterImpl(keywordLexicon, indexFile.toFile()); wordsFile1 = Files.createTempFile("words1", ".idx"); urlsFile1 = Files.createTempFile("urls1", ".idx"); @@ -50,7 +49,7 @@ class SearchIndexJournalWriterTest { @SneakyThrows @AfterEach void tearDown() { - dictionaryWriter.close(); + keywordLexicon.close(); writer.close(); indexFile.toFile().delete(); dictionaryFile.toFile().delete(); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java deleted file mode 100644 index e780ed62..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java +++ /dev/null @@ -1,28 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service; - -import nu.marginalia.wmsa.edge.index.dictionary.TokenCompressor; -import org.junit.jupiter.api.Test; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; - -class TokenCompressorTest { - - @Test - public void getWordBytes() { - final Map map = new HashMap<>(); - TokenCompressor tc = new TokenCompressor(word -> { - map.put(word, map.size()); - return map.size()-1; - }); - - System.out.println(Arrays.toString(tc.getWordBytes("308"))); - System.out.println(Arrays.toString(tc.getWordBytes(".308"))); - System.out.println(Arrays.toString(tc.getWordBytes("308."))); - System.out.println(Arrays.toString(tc.getWordBytes("30.8."))); - System.out.println(Arrays.toString(tc.getWordBytes("30..."))); - - map.entrySet().forEach(System.out::println); - } -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryDataTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryDataTest.java index cd063ea8..a88715a2 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryDataTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryDataTest.java @@ -1,21 +1,17 @@ package nu.marginalia.wmsa.edge.index.service.util; -import nu.marginalia.util.dict.DictionaryData; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - class DictionaryDataTest { - @Test - public void testDataBankGrow2() { - var dataBank = new DictionaryData(65535); - for (int i = 0; i < 64; i++) { - String s = "" + i; - int offset = dataBank.add(s.getBytes(), i); - System.out.println(s + " " + offset + " " + new String(dataBank.getBytes(i)) + " " + dataBank.getValue(i)); - - Assertions.assertEquals(s, new String(dataBank.getBytes(i))); - Assertions.assertEquals(i, dataBank.getValue(i)); - } - } +// @Test +// public void testDataBankGrow2() { +// var dataBank = new DictionaryData(65535); +// for (int i = 0; i < 64; i++) { +// String s = "" + i; +// int offset = dataBank.add(s.getBytes(), i); +// System.out.println(s + " " + offset + " " + new String(dataBank.getKey(i)) + " " + dataBank.getValue(i)); +// +// Assertions.assertEquals(s, new String(dataBank.getKey(i))); +// Assertions.assertEquals(i, dataBank.getValue(i)); +// } +// } } \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryHashMapTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryHashMapTest.java index b9a54237..c39d5c03 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryHashMapTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryHashMapTest.java @@ -1,67 +1,58 @@ package nu.marginalia.wmsa.edge.index.service.util; -import nu.marginalia.util.dict.DictionaryHashMap; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -import java.util.HashSet; -import java.util.Set; - -import static org.junit.jupiter.api.Assertions.*; - class DictionaryHashMapTest { - - @Test - public void testDictionaryHashMap() { - var dhm = new DictionaryHashMap(1<<6); - System.out.println(dhm.put("hello".getBytes(), 23)); - System.out.println(dhm.put("hello".getBytes(), 23)); - System.out.println(dhm.put("world".getBytes(), 54)); - assertEquals(23, dhm.get("hello".getBytes())); - assertEquals(54, dhm.get("world".getBytes())); - - } - - @Test - public void testDictionaryHashMapMissing() { - var dhm = new DictionaryHashMap(1<<8); - assertEquals(DictionaryHashMap.NO_VALUE, dhm.get(new byte[] { 1,2,3})); - - } - - @Test - public void randomTest() { - Set strings = new HashSet<>(); - var dhm = new DictionaryHashMap(1<<14); - - for (int i = 0; i < 10000; i++) { - strings.add(Double.toString(Math.random())); - } - - for (String s : strings) { - dhm.put(s.getBytes(), s.hashCode()); - } - - for (String s : strings) { - assertEquals(s.hashCode(), dhm.get(s.getBytes())); - } - - assertEquals(strings.size(), dhm.size()); - } - - @Test - public void fillHerUp2() { - var dhm = new DictionaryHashMap(1<<13); - - try { - for (int i = 0; i < 10000; i++) { - dhm.put(Double.toString(Math.random()).getBytes(), i); - } - Assertions.fail("Expected exception"); - } - catch (IllegalStateException ex) { - ex.printStackTrace(); - } - } +// +// @Test +// public void testDictionaryHashMap() { +// var dhm = new DictionaryHashMap(1<<6); +// System.out.println(dhm.put("hello".getBytes(), 23)); +// System.out.println(dhm.put("hello".getBytes(), 23)); +// System.out.println(dhm.put("world".getBytes(), 54)); +// assertEquals(23, dhm.get("hello".getBytes())); +// assertEquals(54, dhm.get("world".getBytes())); +// +// } +// +// @Test +// public void testDictionaryHashMapMissing() { +// var dhm = new DictionaryHashMap(1<<8); +// assertEquals(DictionaryHashMap.NO_VALUE, dhm.get(new byte[] { 1,2,3})); +// +// } +// +// @Test +// public void randomTest() { +// Set strings = new HashSet<>(); +// var dhm = new DictionaryHashMap(1<<14); +// +// for (int i = 0; i < 10000; i++) { +// strings.add(Double.toString(Math.random())); +// } +// +// for (String s : strings) { +// dhm.put(s.getBytes(), s.hashCode()); +// } +// +// for (String s : strings) { +// assertEquals(s.hashCode(), dhm.get(s.getBytes())); +// } +// +// assertEquals(strings.size(), dhm.size()); +// } +// +// @Test +// public void fillHerUp2() { +// var dhm = new DictionaryHashMap(1<<13); +// +// try { +// for (int i = 0; i < 10000; i++) { +// dhm.put(Double.toString(Math.random()).getBytes(), i); +// } +// Assertions.fail("Expected exception"); +// } +// catch (IllegalStateException ex) { +// ex.printStackTrace(); +// } +// } } \ No newline at end of file From 853108028e5ab53697160e64fcdf7a0dfea8a58f Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 4 Jul 2022 14:47:16 +0200 Subject: [PATCH 14/40] WIP: Selective URL param strings --- .../converting/LinkKeywordExtractorMain.java | 3 +- .../converting/atags/AnchorTextExtractor.java | 16 ++++---- .../edge/converting/loader/SqlLoadUrls.java | 11 +++-- .../processor/logic/LinkParser.java | 41 ++++++++++++++++--- .../crawling/retreival/CrawlerRetreiver.java | 6 +-- .../edge/crawling/retreival/HttpFetcher.java | 4 +- .../StackOverflowPostsReader.java | 2 +- .../wikipedia/WikipediaReader.java | 2 +- .../wmsa/edge/model/EdgeDomain.java | 2 +- .../marginalia/wmsa/edge/model/EdgeUrl.java | 14 +++++-- .../main/resources/sql/edge-crawler-cache.sql | 22 ++++++---- .../wmsa/edge/model/EdgeUrlTest.java | 9 ++++ 12 files changed, 93 insertions(+), 39 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java index 792dac6f..156dbdaa 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java @@ -144,7 +144,8 @@ public class LinkKeywordExtractorMain { try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) { AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains, - url -> crawledUrls.contains(url.toString().hashCode()), + url -> url.params != null, + //url -> crawledUrls.contains(url.toString().hashCode()), output::write); logger.info("Reading files"); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java index c96fd400..c44e7f18 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java @@ -74,9 +74,6 @@ public class AnchorTextExtractor { if (!isInterestingAnchorText(text)) { return; } - if (href.contains("?")) { - return; - } var optLinkUrl = linkParser.parseLink(documentUrl, href); if (optLinkUrl.isEmpty()) return; @@ -92,13 +89,16 @@ public class AnchorTextExtractor { continue; word = word.toLowerCase(); - if (!WordPatterns.filter(word)) + if (!WordPatterns.filter(word)) { continue; + } - if (!linkUrl.domain.equals(documentUrl.domain)) { - if (isNewKeywordForLink(word, linkUrl.toString())) { - linkKeywordConsumer.accept(linkUrl, word); - } + if (linkUrl.domain.equals(documentUrl.domain)) { + continue; + } + + if (isNewKeywordForLink(word, linkUrl.toString())) { + linkKeywordConsumer.accept(linkUrl, word); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java index ba9ae43a..04c9735f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java @@ -30,6 +30,7 @@ public class SqlLoadUrls { IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, IN PORT INT, IN PATH VARCHAR(255), + IN PARAM VARCHAR(255), IN PATH_HASH BIGINT ) BEGIN @@ -45,8 +46,8 @@ public class SqlLoadUrls { public void load(LoaderData data, EdgeUrl[] urls) { try (var conn = dataSource.getConnection(); - var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?, ?)"); - var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH FROM EC_URL WHERE DOMAIN_ID=?") + var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)"); + var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?") ) { conn.setAutoCommit(false); @@ -61,7 +62,8 @@ public class SqlLoadUrls { insertCall.setNull(3, Types.INTEGER); } insertCall.setString(4, url.path); - insertCall.setLong(5, hashPath(url.path)); + insertCall.setString(5, url.params); + insertCall.setLong(6, hashPath(url.path)); insertCall.addBatch(); } var ret = insertCall.executeBatch(); @@ -84,8 +86,9 @@ public class SqlLoadUrls { int urlId = rsp.getInt(1); String proto = rsp.getString(2); String path = rsp.getString(3); + String param = rsp.getString(4); - data.addUrl(new EdgeUrl(proto, targetDomain, null, path), urlId); + data.addUrl(new EdgeUrl(proto, targetDomain, null, path, param), urlId); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index 0a2bdf45..c14e31cb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -13,9 +13,12 @@ import org.slf4j.LoggerFactory; import java.net.URI; import java.net.URISyntaxException; +import java.util.Arrays; import java.util.List; import java.util.Optional; +import java.util.function.Predicate; import java.util.regex.Pattern; +import java.util.stream.Collectors; public class LinkParser { private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -107,21 +110,30 @@ public class LinkParser { @SneakyThrows private String resolveUrl(EdgeUrl baseUrl, String s) { - s = paramRegex.matcher(s).replaceAll(""); // url looks like http://www.marginalia.nu/ if (isAbsoluteDomain(s)) { return s; } - // url looks like /my-page - if (s.startsWith("/")) { - return baseUrl.withPath(s).toString(); + String[] parts = s.split("\\?", 2); + String path = parts[0]; + String param; + if (parts.length > 1) { + param = queryParamsSanitizer(parts[1]); + } + else { + param = null; } - final String partFromNewLink = spaceRegex.matcher(s).replaceAll("%20"); + // url looks like /my-page + if (path.startsWith("/")) { + return baseUrl.withPathAndParam(path, param).toString(); + } - return baseUrl.withPath(relativeNavigation(baseUrl) + partFromNewLink).toString(); + final String partFromNewLink = spaceRegex.matcher(path).replaceAll("%20"); + + return baseUrl.withPathAndParam(relativeNavigation(baseUrl) + partFromNewLink, param).toString(); } // for a relative url that looks like /foo or /foo/bar; return / or /foo @@ -183,4 +195,21 @@ public class LinkParser { return documentUrl; } + + private static final Pattern paramSplitterPattern = Pattern.compile("&"); + private static final Predicate paramPatternPredicate = Pattern.compile("((id|i|p|t|v|m|name|view|post)=[a-zA-Z\\d]+)|(view=(/[a-zA-Z\\d\\-])+)").asMatchPredicate(); + + public static String queryParamsSanitizer(String queryParams) { + if (queryParams == null) { + return null; + } + + var ret = Arrays.stream(paramSplitterPattern.split(queryParams)) + .filter(paramPatternPredicate) + .sorted() + .collect(Collectors.joining("&")); + if (ret.isBlank()) + return null; + return ret; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index 2b27ed4d..c275ad6f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -63,7 +63,7 @@ public class CrawlerRetreiver { if (queue.peek() != null) { var fst = queue.peek(); - var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/"); + var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/", null); if (known.add(root)) queue.addFirst(root); } @@ -110,7 +110,7 @@ public class CrawlerRetreiver { .build()); } - var fetchResult = fetcher.probeDomain(new EdgeUrl(fst.proto, fst.domain, fst.port, "/")); + var fetchResult = fetcher.probeDomain(fst.domain.toRootUrl()); if (!fetchResult.ok()) { logger.debug("Bad status on {}", domain); return Optional.of(createErrorPostFromStatus(fetchResult)); @@ -232,7 +232,7 @@ public class CrawlerRetreiver { } private Optional findCanonicalUrl(EdgeUrl baseUrl, Document parsed) { - baseUrl = baseUrl.withPath("/"); + baseUrl = baseUrl.domain.toRootUrl(); for (var link : parsed.select("link[rel=canonical]")) { return linkParser.parseLink(baseUrl, link); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java index 40728294..53180137 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java @@ -109,7 +109,7 @@ public class HttpFetcher { @SneakyThrows public FetchResult probeDomain(EdgeUrl url) { var head = new Request.Builder().head().addHeader("User-agent", userAgent) - .url(new EdgeUrl(url.proto, url.domain, url.port, "/").toString()) + .url(url.domain.toRootUrl().toString()) .build(); var call = client.newCall(head); @@ -293,7 +293,7 @@ public class HttpFetcher { private Optional fetchRobotsForProto(String proto, EdgeDomain domain) { try { - var url = new EdgeUrl(proto, domain, null, "/robots.txt"); + var url = new EdgeUrl(proto, domain, null, "/robots.txt", null); return Optional.of(parseRobotsTxt(fetchContent(url))); } catch (Exception ex) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java index 0fecf63a..88921be1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java @@ -64,7 +64,7 @@ public class StackOverflowPostsReader extends DefaultHandler { } private StackOverflowPost createPost(StackOverflowQuestionData data) { - EdgeUrl url = new EdgeUrl("https", domain, null, "/questions/"+data.getId()); + EdgeUrl url = new EdgeUrl("https", domain, null, "/questions/"+data.getId(), null); StringBuilder body = new StringBuilder(); body.append(data.getQuestion()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java index 12bfec3f..fa5904c9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java @@ -37,7 +37,7 @@ public class WikipediaReader { } private EdgeUrl synthesizeUrl(String originalUrl) { - return new EdgeUrl("https", domain, null, "/wiki/"+originalUrl); + return new EdgeUrl("https", domain, null, "/wiki/"+originalUrl, null); } public void join() throws InterruptedException { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java index d1945c9e..658184c0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java @@ -59,7 +59,7 @@ public class EdgeDomain implements WideHashable { public EdgeUrl toRootUrl() { // Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http - return new EdgeUrl("http", this, null, "/"); + return new EdgeUrl("http", this, null, "/", null); } public String toString() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java index e82d4b7c..b7681951 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java @@ -4,6 +4,7 @@ import lombok.Builder; import lombok.EqualsAndHashCode; import lombok.Getter; import lombok.Setter; +import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import java.net.URI; import java.net.URISyntaxException; @@ -15,12 +16,14 @@ public class EdgeUrl implements WideHashable { public final EdgeDomain domain; public final Integer port; public final String path; + public final String params; - public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path) { + public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String params) { this.proto = proto; this.domain = domain; this.port = port(port, proto); this.path = path; + this.params = params; } public EdgeUrl(String url) throws URISyntaxException { @@ -77,8 +80,10 @@ public class EdgeUrl implements WideHashable { this.path = URI.getPath().isEmpty() ? "/" : URI.getPath(); this.proto = URI.getScheme().toLowerCase(); this.port = port(URI.getPort(), proto); + this.params = LinkParser.queryParamsSanitizer(URI.getQuery()); } + private static Integer port(Integer port, String protocol) { if (null == port || port < 1) { return null; @@ -94,8 +99,9 @@ public class EdgeUrl implements WideHashable { public String toString() { String portPart = port == null ? "" : (":" + port); + String queryPart = params == null ? "" : ("?" + params); - return proto + "://" + domain + portPart + "" + path; + return proto + "://" + domain + portPart + path + queryPart; } public String dir() { @@ -115,7 +121,7 @@ public class EdgeUrl implements WideHashable { return (int) path.chars().filter(c -> c=='/').count(); } - public EdgeUrl withPath(String s) { - return new EdgeUrl(proto, domain, port, s); + public EdgeUrl withPathAndParam(String path, String param) { + return new EdgeUrl(proto, domain, port, path, param); } } diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index 36ab040a..120a1ce2 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -46,20 +46,23 @@ COLLATE utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS EC_URL ( ID INT PRIMARY KEY AUTO_INCREMENT, DOMAIN_ID INT NOT NULL, - PROTO ENUM('http','https','gemini') NOT NULL, - PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin, + + PROTO ENUM('http','https','gemini') NOT NULL COLLATE utf8mb4_unicode_ci, + PATH VARCHAR(255) NOT NULL, PORT INT, + PARAM VARCHAR(255), PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain", + VISITED BOOLEAN NOT NULL DEFAULT FALSE, - STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok', + STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok' COLLATE utf8mb4_unicode_ci, CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH), FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE ) CHARACTER SET utf8mb4 -COLLATE utf8mb4_unicode_ci; +COLLATE utf8mb4_bin; CREATE TABLE IF NOT EXISTS EC_PAGE_DATA ( ID INT PRIMARY KEY AUTO_INCREMENT, @@ -113,10 +116,13 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK ( CREATE OR REPLACE VIEW EC_URL_VIEW AS SELECT - IF(PORT IS NULL, - CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, EC_URL.PATH), - CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, ":", EC_URL.PORT, EC_URL.PATH)) - AS URL, + CONCAT(EC_URL.PROTO, + '://', + EC_DOMAIN.DOMAIN_NAME, + IF(EC_URL.PORT IS NULL, '', CONCAT(':', EC_URL.PORT)), + EC_URL.PATH, + IF(EC_URL.PARAM IS NULL, '', CONCAT('?', EC_URL.PARAM)) + ) AS URL, EC_URL.PATH_HASH AS PATH_HASH, EC_URL.PATH AS PATH, EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME, diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java index dac8dd97..c16f1f08 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java @@ -17,4 +17,13 @@ class EdgeUrlTest { System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign")); System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\"")); } + + @Test + void testParms() throws URISyntaxException { + System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123")); + System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123")); + System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123")); + System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123")); + System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123")); + } } \ No newline at end of file From f3be865293a792c8cc343e5f32f240b9b7e46733 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Fri, 8 Jul 2022 16:36:09 +0200 Subject: [PATCH 15/40] Allow query params for *some* path,param combinations, targeted at allowing the crawl of forums. --- .../converting/LinkKeywordExtractorMain.java | 2 +- .../converting/atags/AnchorTextExtractor.java | 4 +- .../edge/converting/loader/SqlLoadUrls.java | 17 +++++-- .../processor/logic/LinkParser.java | 22 +------- .../processor/logic/LinkProcessor.java | 2 +- .../processor/logic/QueryParams.java | 50 +++++++++++++++++++ .../edge/crawling/blocklist/UrlBlocklist.java | 8 +-- .../crawling/retreival/CrawlerRetreiver.java | 10 ++-- .../edge/index/lexicon/KeywordLexicon.java | 6 +-- .../marginalia/wmsa/edge/model/EdgeUrl.java | 12 ++--- 10 files changed, 84 insertions(+), 49 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java index 156dbdaa..99c93740 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java @@ -144,7 +144,7 @@ public class LinkKeywordExtractorMain { try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) { AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains, - url -> url.params != null, + url -> url.param != null, //url -> crawledUrls.contains(url.toString().hashCode()), output::write); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java index c44e7f18..8c5fc6c1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java @@ -138,8 +138,8 @@ public class AnchorTextExtractor { private boolean isNewKeywordForLink(String href, String text) { long hash = 0; - hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong(); - hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong(); + hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).padToLong(); + hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).padToLong(); // Remove sign bit because we don't want a negative index in deduplicateHashBitset hash &= 0x7FFF_FFFF_FFFF_FFFFL; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java index 04c9735f..d09fac4a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java @@ -1,5 +1,6 @@ package nu.marginalia.wmsa.edge.converting.loader; +import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; @@ -62,8 +63,8 @@ public class SqlLoadUrls { insertCall.setNull(3, Types.INTEGER); } insertCall.setString(4, url.path); - insertCall.setString(5, url.params); - insertCall.setLong(6, hashPath(url.path)); + insertCall.setString(5, url.param); + insertCall.setLong(6, hashPath(url.path, url.param)); insertCall.addBatch(); } var ret = insertCall.executeBatch(); @@ -97,7 +98,15 @@ public class SqlLoadUrls { } } - private long hashPath(String path) { - return Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8).asLong(); + private static final HashFunction murmur3_128 = Hashing.murmur3_128(); + private long hashPath(String path, String queryParam) { + long pathHash = murmur3_128.hashString(path, StandardCharsets.UTF_8).padToLong(); + + if (queryParam == null) { + return pathHash; + } + else { + return pathHash + murmur3_128.hashString(queryParam, StandardCharsets.UTF_8).padToLong(); + } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index c14e31cb..d58b15bf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -13,12 +13,9 @@ import org.slf4j.LoggerFactory; import java.net.URI; import java.net.URISyntaxException; -import java.util.Arrays; import java.util.List; import java.util.Optional; -import java.util.function.Predicate; import java.util.regex.Pattern; -import java.util.stream.Collectors; public class LinkParser { private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -105,7 +102,6 @@ public class LinkParser { return url; } - private static final Pattern paramRegex = Pattern.compile("\\?.*$"); private static final Pattern spaceRegex = Pattern.compile(" "); @SneakyThrows @@ -120,7 +116,7 @@ public class LinkParser { String path = parts[0]; String param; if (parts.length > 1) { - param = queryParamsSanitizer(parts[1]); + param = QueryParams.queryParamsSanitizer(parts[0], parts[1]); } else { param = null; @@ -196,20 +192,4 @@ public class LinkParser { return documentUrl; } - private static final Pattern paramSplitterPattern = Pattern.compile("&"); - private static final Predicate paramPatternPredicate = Pattern.compile("((id|i|p|t|v|m|name|view|post)=[a-zA-Z\\d]+)|(view=(/[a-zA-Z\\d\\-])+)").asMatchPredicate(); - - public static String queryParamsSanitizer(String queryParams) { - if (queryParams == null) { - return null; - } - - var ret = Arrays.stream(paramSplitterPattern.split(queryParams)) - .filter(paramPatternPredicate) - .sorted() - .collect(Collectors.joining("&")); - if (ret.isBlank()) - return null; - return ret; - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java index 24c9229d..54c47e4c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java @@ -72,7 +72,7 @@ public class LinkProcessor { return false; } - if (urlBlocklist.isForumLink(link)) { + if (urlBlocklist.isMailingListLink(link)) { return false; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java new file mode 100644 index 00000000..ad52e347 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java @@ -0,0 +1,50 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic; + +import javax.annotation.Nullable; +import java.util.Arrays; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +public class QueryParams { + + private static final Pattern paramSplitterPattern = Pattern.compile("&"); + + @Nullable + public static String queryParamsSanitizer(String path, @Nullable String queryParams) { + if (queryParams == null) { + return null; + } + + var ret = Arrays.stream(paramSplitterPattern.split(queryParams)) + .filter(param -> QueryParams.isPermittedParam(path, param)) + .sorted() + .collect(Collectors.joining("&")); + + if (ret.isBlank()) + return null; + + return ret; + } + + public static boolean isPermittedParam(String path, String param) { + if (path.endsWith("index.php")) { + if (param.startsWith("showtopic")) + return true; + if (param.startsWith("showforum")) + return true; + } + if (path.endsWith("viewtopic.php")) { + return (param.startsWith("t=") || param.startsWith("p=")); + } + if (path.endsWith("viewforum.php")) { + return param.startsWith("v="); + } + if (path.endsWith("showthread.php")) { + return (param.startsWith("t=") || param.startsWith("p=")); + } + if (path.endsWith("showforum.php")) { + return param.startsWith("v="); + } + return false; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java index f81ca0db..b70e4ab0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java @@ -33,20 +33,14 @@ public class UrlBlocklist { } } - public boolean isForumLink(EdgeUrl linkUrl) { + public boolean isMailingListLink(EdgeUrl linkUrl) { var path = linkUrl.path; - if (path.startsWith("/forum")) { - return true; - } if (path.startsWith("/lists/")) { return true; } if (path.startsWith("mailinglist")) { return true; } - if (path.contains("phpbb")) { - return true; - } return false; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index c275ad6f..b9fb79c5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -63,7 +63,7 @@ public class CrawlerRetreiver { if (queue.peek() != null) { var fst = queue.peek(); - var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/", null); + var root = fst.domain.toRootUrl(); if (known.add(root)) queue.addFirst(root); } @@ -121,6 +121,8 @@ public class CrawlerRetreiver { private CrawledDomain crawlDomain() { String ip = findIp(domain); + assert !queue.isEmpty(); + var robotsRules = fetcher.fetchRobotRules(queue.peek().domain); long crawlDelay = robotsRules.getCrawlDelay(); @@ -209,7 +211,7 @@ public class CrawlerRetreiver { linkParser.parseLink(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) - .filter(u -> !urlBlocklist.isForumLink(u)) + .filter(u -> !urlBlocklist.isMailingListLink(u)) .filter(known::add) .ifPresent(queue::addLast); } @@ -217,7 +219,7 @@ public class CrawlerRetreiver { linkParser.parseFrame(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) - .filter(u -> !urlBlocklist.isForumLink(u)) + .filter(u -> !urlBlocklist.isMailingListLink(u)) .filter(known::add) .ifPresent(queue::addLast); } @@ -225,7 +227,7 @@ public class CrawlerRetreiver { linkParser.parseFrame(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) - .filter(u -> !urlBlocklist.isForumLink(u)) + .filter(u -> !urlBlocklist.isMailingListLink(u)) .filter(known::add) .ifPresent(queue::addLast); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java index 6485f381..8d15f8f3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java @@ -46,13 +46,13 @@ public class KeywordLexicon implements AutoCloseable { } private void loadJournalEntry(byte[] bytes) { - final long key = hashFunction.hashBytes(bytes).asLong(); + final long key = hashFunction.hashBytes(bytes).padToLong(); reverseIndex.put(key); } @SneakyThrows public int getOrInsert(String macroWord) { - final long key = hashFunction.hashBytes(macroWord.getBytes()).asLong(); + final long key = hashFunction.hashBytes(macroWord.getBytes()).padToLong(); int idx = getReadOnly(key); if (idx >= 0) @@ -78,7 +78,7 @@ public class KeywordLexicon implements AutoCloseable { } public int getReadOnly(String word) { - return getReadOnly(hashFunction.hashBytes(word.getBytes()).asLong()); + return getReadOnly(hashFunction.hashBytes(word.getBytes()).padToLong()); } public int getReadOnly(long hashedKey) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java index b7681951..123bd95a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java @@ -4,7 +4,7 @@ import lombok.Builder; import lombok.EqualsAndHashCode; import lombok.Getter; import lombok.Setter; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; +import nu.marginalia.wmsa.edge.converting.processor.logic.QueryParams; import java.net.URI; import java.net.URISyntaxException; @@ -16,14 +16,14 @@ public class EdgeUrl implements WideHashable { public final EdgeDomain domain; public final Integer port; public final String path; - public final String params; + public final String param; - public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String params) { + public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String param) { this.proto = proto; this.domain = domain; this.port = port(port, proto); this.path = path; - this.params = params; + this.param = param; } public EdgeUrl(String url) throws URISyntaxException { @@ -80,7 +80,7 @@ public class EdgeUrl implements WideHashable { this.path = URI.getPath().isEmpty() ? "/" : URI.getPath(); this.proto = URI.getScheme().toLowerCase(); this.port = port(URI.getPort(), proto); - this.params = LinkParser.queryParamsSanitizer(URI.getQuery()); + this.param = QueryParams.queryParamsSanitizer(this.path, URI.getQuery()); } @@ -99,7 +99,7 @@ public class EdgeUrl implements WideHashable { public String toString() { String portPart = port == null ? "" : (":" + port); - String queryPart = params == null ? "" : ("?" + params); + String queryPart = param == null ? "" : ("?" + param); return proto + "://" + domain + portPart + path + queryPart; } From 2b83e0d75408b5fb8833628794dba010b12a3eb2 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Fri, 8 Jul 2022 16:50:00 +0200 Subject: [PATCH 16/40] Block websites with "acceptable ads", as this seems a strong indicator the domain is either parked or spam. --- .../model/DisqualifiedException.java | 7 +++++- .../converting/processor/AcceptableAds.java | 22 +++++++++++++++++++ .../processor/DocumentProcessor.java | 21 +++++++++++------- 3 files changed, 41 insertions(+), 9 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/AcceptableAds.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java index 1c785371..c252f315 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java @@ -12,6 +12,11 @@ public class DisqualifiedException extends Exception { } public enum DisqualificationReason { - LENGTH, CONTENT_TYPE, LANGUAGE, STATUS, QUALITY + LENGTH, + CONTENT_TYPE, + LANGUAGE, + STATUS, + QUALITY, + ACCEPTABLE_ADS } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/AcceptableAds.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/AcceptableAds.java new file mode 100644 index 00000000..2814eea7 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/AcceptableAds.java @@ -0,0 +1,22 @@ +package nu.marginalia.wmsa.edge.converting.processor; + +import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; +import org.jsoup.nodes.Document; + + +public class AcceptableAds { + /* Acceptable Ads is an initiative to allow less intrusive ads to punch through adblockers. + * + * In practice, from looking at crawled data, the only sites in the crawled corpus that seem to + * follow this standard are domain squatters and other nuisance sites. + * + */ + + public static boolean hasAcceptableAdsTag(Document parsedDocument) { + return parsedDocument.getElementsByTag("html").hasAttr("data-adblockkey"); + } + + public static boolean hasAcceptableAdsHeader(CrawledDocument document) { + return document.headers.contains("X-Adblock-Key"); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index b205cdea..d6cf2e46 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -3,19 +3,15 @@ package nu.marginalia.wmsa.edge.converting.processor; import com.google.common.hash.HashCode; import com.google.inject.Inject; import com.google.inject.name.Named; +import nu.marginalia.util.language.LanguageFilter; +import nu.marginalia.util.language.processing.DocumentKeywordExtractor; +import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException; import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.DisqualificationReason; import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; import nu.marginalia.wmsa.edge.converting.model.ProcessedDocumentDetails; import nu.marginalia.wmsa.edge.converting.processor.logic.*; -import nu.marginalia.wmsa.edge.converting.processor.logic.FeedExtractor; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; -import nu.marginalia.util.language.LanguageFilter; -import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import nu.marginalia.util.language.processing.SentenceExtractor; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; -import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlStandardExtractor; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; @@ -81,6 +77,10 @@ public class DocumentProcessor { if (ret.state == EdgeUrlState.OK) { + if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) { + throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS); + } + if (isAcceptedContentType(crawledDocument)) { var detailsWords = createDetails(crawledDomain, crawledDocument); @@ -128,6 +128,11 @@ public class DocumentProcessor { throws DisqualifiedException, URISyntaxException { var doc = Jsoup.parse(crawledDocument.documentBody); + + if (AcceptableAds.hasAcceptableAdsTag(doc)) { + throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS); + } + var dld = sentenceExtractor.extractSentences(doc.clone()); checkDocumentLanguage(dld); From 7dea94d36d5f90883a8982e21450eb3afb161d7d Mon Sep 17 00:00:00 2001 From: vlofgren Date: Fri, 8 Jul 2022 17:25:16 +0200 Subject: [PATCH 17/40] Cleaned up HTML features code a bit. --- .../processor/DocumentProcessor.java | 18 ++++-------------- .../processor/logic/HtmlFeature.java | 18 ++++++++++++------ 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index d6cf2e46..d04415fd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -163,7 +163,6 @@ public class DocumentProcessor { var edgeDomain = url.domain; tagWords.add("format:"+ret.standard.toString().toLowerCase()); - tagWords.add("site:" + edgeDomain.toString().toLowerCase()); if (!Objects.equals(edgeDomain.toString(), edgeDomain.domain)) { tagWords.add("site:" + edgeDomain.domain.toLowerCase()); @@ -172,18 +171,7 @@ public class DocumentProcessor { tagWords.add("proto:"+url.proto.toLowerCase()); tagWords.add("js:" + Boolean.toString(ret.features.contains(HtmlFeature.JS)).toLowerCase()); - if (ret.features.contains(HtmlFeature.MEDIA)) { - tagWords.add("special:media"); - } - if (ret.features.contains(HtmlFeature.TRACKING)) { - tagWords.add("special:tracking"); - } - if (ret.features.contains(HtmlFeature.AFFILIATE_LINK)) { - tagWords.add("special:affiliate"); - } - if (ret.features.contains(HtmlFeature.COOKIES)) { - tagWords.add("special:cookies"); - } + ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add); words.append(IndexBlock.Meta, tagWords); words.append(IndexBlock.Words, tagWords); @@ -201,7 +189,9 @@ public class DocumentProcessor { for (var frame : doc.getElementsByTag("frame")) { linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept); } - + for (var frame : doc.getElementsByTag("iframe")) { + linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept); + } for (var link : doc.select("link[rel=alternate]")) { feedExtractor .getFeedFromAlternateTag(baseUrl, link) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java index c8a839ac..ff835dc7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java @@ -3,17 +3,23 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; import java.util.Collection; public enum HtmlFeature { - MEDIA(0), - JS(1), - AFFILIATE_LINK(2), - TRACKING(3), - COOKIES(4) + MEDIA(0, "special:media"), + JS(1, "special:scripts"), + AFFILIATE_LINK(2, "special:affiliate"), + TRACKING(3, "special:tracking"), + COOKIES(4, "special:cookies") ; public final int bit; + private final String keyword; - HtmlFeature(int bit) { + HtmlFeature(int bit, String keyword) { this.bit = bit; + this.keyword = keyword; + } + + public String getKeyword() { + return keyword; } public static int encode(Collection featuresAll) { From b0c40136caa323ad4b93f760c01edbb73966eab9 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Fri, 8 Jul 2022 19:52:12 +0200 Subject: [PATCH 18/40] Cleaned up HTML features code a bit. --- .../processor/logic/HtmlFeature.java | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java index ff835dc7..032315dd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java @@ -3,18 +3,16 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; import java.util.Collection; public enum HtmlFeature { - MEDIA(0, "special:media"), - JS(1, "special:scripts"), - AFFILIATE_LINK(2, "special:affiliate"), - TRACKING(3, "special:tracking"), - COOKIES(4, "special:cookies") + MEDIA( "special:media"), + JS("special:scripts"), + AFFILIATE_LINK( "special:affiliate"), + TRACKING("special:tracking"), + COOKIES("special:cookies") ; - public final int bit; private final String keyword; - HtmlFeature(int bit, String keyword) { - this.bit = bit; + HtmlFeature(String keyword) { this.keyword = keyword; } @@ -23,12 +21,14 @@ public enum HtmlFeature { } public static int encode(Collection featuresAll) { - return featuresAll.stream().mapToInt(f -> 1 << f.bit).reduce(0, (l, r) -> (l|r)); + int ret = 0; + for (var feature : featuresAll) { + ret |= (1 << (feature.ordinal())); + } + return ret; } + public static boolean hasFeature(int value, HtmlFeature feature) { - return (value & (1<< feature.bit)) != 0; - } - public static int addFeature(int value, HtmlFeature feature) { - return (value | (1<< feature.bit)); + return (value & (1<< feature.ordinal())) != 0; } } From fed2fa9397b33f0d81934a87bb52738c207377dd Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 11 Jul 2022 23:25:03 +0200 Subject: [PATCH 19/40] Fix tiny NPE in converting --- .../wmsa/edge/converting/ConverterMain.java | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java index 61ff0b00..973554d2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java @@ -1,18 +1,19 @@ package nu.marginalia.wmsa.edge.converting; -import com.google.gson.*; +import com.google.common.base.Strings; +import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; +import nu.marginalia.util.ParallelPipe; import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor; import nu.marginalia.wmsa.edge.converting.processor.InstructionsCompiler; import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader; -import nu.marginalia.wmsa.edge.crawling.WorkLog; import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader; +import nu.marginalia.wmsa.edge.crawling.WorkLog; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; -import nu.marginalia.util.ParallelPipe; import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -96,6 +97,10 @@ public class ConverterMain { domainToId.forEach((domain, id) -> { String fileName = idToFileName.get(id); + + if (Strings.isNullOrEmpty(fileName)) + return; + Path dest = plan.getCrawledFilePath(fileName); logger.info("{} - {} - {}", domain, id, dest); From 20970a6161d92b2059e697d29b89f573e26e07d6 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 14 Jul 2022 12:37:06 +0200 Subject: [PATCH 20/40] Make processor more lenient toward quality, accept content-types which specify charset --- .../wmsa/edge/converting/ConverterModule.java | 2 +- .../processor/DocumentProcessor.java | 14 +++++++- .../converting/processor/DomainProcessor.java | 32 +++++++++++++++++-- .../processor/InstructionsCompiler.java | 15 +++++---- .../processor/logic/DocumentValuator.java | 14 +++----- 5 files changed, 57 insertions(+), 20 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java index 4bf6eaea..1177c1a7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java @@ -11,7 +11,6 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import java.net.URISyntaxException; -import java.nio.file.Path; public class ConverterModule extends AbstractModule { @@ -27,6 +26,7 @@ public class ConverterModule extends AbstractModule { bind(Gson.class).toInstance(createGson()); bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.); + bind(Double.class).annotatedWith(Names.named("min-avg-document-quality")).toInstance(-25.); bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(100); bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128); bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index d04415fd..618e5efb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -113,7 +113,19 @@ public class DocumentProcessor { } private boolean isAcceptedContentType(CrawledDocument crawledDocument) { - return crawledDocument.contentType != null && acceptedContentTypes.contains(crawledDocument.contentType.toLowerCase()); + if (crawledDocument.contentType == null) { + return false; + } + + var ct = crawledDocument.contentType; + + if (acceptedContentTypes.contains(ct)) + return true; + + if (ct.contains(";")) { + return acceptedContentTypes.contains(ct.substring(0, ct.indexOf(';'))); + } + return false; } private EdgeUrlState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java index 4343b0c3..b8b53f9d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java @@ -1,21 +1,29 @@ package nu.marginalia.wmsa.edge.converting.processor; import com.google.inject.Inject; +import com.google.inject.name.Named; +import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; import java.util.ArrayList; import java.util.Collections; +import java.util.List; public class DomainProcessor { private final DocumentProcessor documentProcessor; + private final Double minAvgDocumentQuality; @Inject - public DomainProcessor(DocumentProcessor documentProcessor) { + public DomainProcessor(DocumentProcessor documentProcessor, + @Named("min-avg-document-quality") Double minAvgDocumentQuality + ) { this.documentProcessor = documentProcessor; + this.minAvgDocumentQuality = minAvgDocumentQuality; } public ProcessedDomain process(CrawledDomain crawledDomain) { @@ -37,17 +45,37 @@ public class DomainProcessor { ret.documents.add(processedDoc); } } - } else { ret.documents = Collections.emptyList(); } + double averageQuality = getAverageQuality(ret.documents); + if (averageQuality < minAvgDocumentQuality) { + ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED); + } + ret.state = getState(crawledDomain.crawlerStatus); return ret; } + private double getAverageQuality(List documents) { + int n = 0; + double q = 0.; + for (var doc : documents) { + if (doc.quality().isPresent()) { + n++; + q += doc.quality().getAsDouble(); + } + } + + if (n > 0) { + return q / n; + } + return -5.; + } + private EdgeDomainIndexingState getState(String crawlerStatus) { return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) { case OK -> EdgeDomainIndexingState.ACTIVE; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java index b75de436..07f1705a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java @@ -42,15 +42,16 @@ public class InstructionsCompiler { Set seenUrls = new HashSet<>(documents.size()*4); Set seenDomains = new HashSet<>(documents.size()); - documents.stream().map(doc -> doc.url).forEach(seenUrls::add); - for (var doc : documents) { - if (doc.details == null) continue; - for (var url : doc.details.linksExternal) { - seenDomains.add(url.domain); + seenUrls.add(doc.url); + + if (doc.details != null) { + for (var url : doc.details.linksExternal) { + seenDomains.add(url.domain); + } + seenUrls.addAll(doc.details.linksExternal); + seenUrls.addAll(doc.details.linksInternal); } - seenUrls.addAll(doc.details.linksExternal); - seenUrls.addAll(doc.details.linksInternal); } ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new))); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java index 6f015ef6..b0423efa 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java @@ -1,8 +1,8 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; import crawlercommons.utils.Strings; -import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException; import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException; import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; import org.jsoup.nodes.Document; @@ -35,7 +35,7 @@ public class DocumentValuator { throw new DisqualifiedException(LENGTH); } - return Math.log(textBodyLength / (double) rawLength)*htmlStandard.scale + return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale + htmlStandard.offset - scriptPenalty - smutCoefficient; @@ -52,17 +52,13 @@ public class DocumentValuator { double scriptPenalty = 0; for (var tag : scriptTags) { - String srcTag = tag.attr("src"); - if (Strings.isBlank(srcTag)) { - scriptPenalty += 1; - } - else if (srcTag.contains("wp-content") || srcTag.contains("wp-includes") || srcTag.contains("jquery")) { + String srcAttr = tag.attr("src"); + if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) { scriptPenalty += 0.49; } - else { + else if (!Strings.isBlank(srcAttr)) { scriptPenalty += 1; } - } return (int)(scriptPenalty + badScript + (scriptText.length())/1000.); } From 661577b456529af56058f3775544a104e23bd38a Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 14 Jul 2022 14:45:31 +0200 Subject: [PATCH 21/40] Add Fossil SCM commits to URL blocklist --- .../marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java | 4 ++-- .../nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java index b70e4ab0..a7dce9ed 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java @@ -11,8 +11,8 @@ public class UrlBlocklist { private final List> patterns = new ArrayList<>(); public UrlBlocklist() { - patterns.add(Pattern.compile(".*/[a-f0-9]{40}(/|$)").asPredicate()); - patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate()); + patterns.add(Pattern.compile(".*/[a-f0-9]{40}(/|$)").asPredicate()); // git + patterns.add(Pattern.compile(".*/[a-f0-9]{64}(/|$)").asPredicate()); // fossil SCM patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate()); patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate()); patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate()); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java index c357f83c..c93e1ffb 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java @@ -6,7 +6,8 @@ import org.junit.jupiter.api.Test; import java.net.URISyntaxException; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; class UrlBlocklistTest { @@ -19,5 +20,6 @@ class UrlBlocklistTest { assertFalse(blocklist.isUrlBlocked(new EdgeUrl("http://yelenasimone.com/nope/x-a-course-in-algebra.html"))); assertTrue(blocklist.isUrlBlocked(new EdgeUrl("http://yelenasimone.com/_module/slide/pqPan/library/american-sour-beer-innovative-techniques-for-mixed-fermentations/"))); assertTrue(blocklist.isUrlBlocked(new EdgeUrl("http://w-m-p.de/images/book/download-firstborn-starcraft-dark-templar-book-1.php"))); + assertTrue(blocklist.isUrlBlocked(new EdgeUrl("https://sqlite.org/src/info/6376abec766e9a0785178b1823b5a587e9f1ccbc"))); } } \ No newline at end of file From c71cc3d43a3313bdaf526b6ca536b39d46bb8d72 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sat, 16 Jul 2022 18:58:19 +0200 Subject: [PATCH 22/40] Fix overflow bugs in DictionaryHashMap that only surfaced without small RAM --- .../java/nu/marginalia/util/dict/DictionaryData.java | 9 +++++++-- .../java/nu/marginalia/util/dict/DictionaryHashMap.java | 3 +-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java index c36c10d2..9aa953dc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java @@ -59,12 +59,14 @@ public class DictionaryData { private final LongBuffer keys; private int size; + private final int capacity; public DictionaryDataBank(int start_idx, int sz) { this.start_idx = start_idx; + this.capacity = sz; - keys = ByteBuffer.allocateDirect(8*sz).asLongBuffer(); + keys = ByteBuffer.allocateDirect(8*capacity).asLongBuffer(); size = 0; } @@ -88,10 +90,13 @@ public class DictionaryData { throw new IndexOutOfBoundsException(idx); } - return keys.get(idx - start_idx) == other; + return keys.get(idx - start_idx) == other; } public int add(long newKey) { + if (size >= capacity) + return -1; + keys.put(size, newKey); return start_idx + size++; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java index 5544545a..1c76b116 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java @@ -66,8 +66,7 @@ public class DictionaryHashMap { logger.debug("Buffer size sanity checked passed"); } - - dictionaryData = new DictionaryData(Math.min(1<<30, Math.max(32, (int)(sizeMemory/4)))); + dictionaryData = new DictionaryData((int)Math.min(1<<27, Math.max(32L, sizeMemory/4))); initializeBuffers(); } From 80b3ac3dd8f30cc2b54c0fb1562906066f923ee2 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sat, 16 Jul 2022 21:19:13 +0200 Subject: [PATCH 23/40] Tweaking the URL block list to exclude git noise better --- .../edge/crawling/blocklist/UrlBlocklist.java | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java index a7dce9ed..b8064952 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java @@ -4,15 +4,26 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl; import java.util.ArrayList; import java.util.List; +import java.util.Set; import java.util.function.Predicate; import java.util.regex.Pattern; public class UrlBlocklist { private final List> patterns = new ArrayList<>(); + // domains that have a lot of links but we know we don't want to crawl + private final Set badDomains = Set.of("t.co", "facebook.com", + "instagram.com", "youtube.com", + "youtu.be", "amzn.to"); + public UrlBlocklist() { - patterns.add(Pattern.compile(".*/[a-f0-9]{40}(/|$)").asPredicate()); // git - patterns.add(Pattern.compile(".*/[a-f0-9]{64}(/|$)").asPredicate()); // fossil SCM + // Don't deep-crawl git repos + patterns.add(Pattern.compile("\\.git/.+").asPredicate()); + + // long base64-strings in URLs are typically git hashes or the like, rarely worth crawling + patterns.add(Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate()); + + // link farms &c patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate()); patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate()); patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate()); @@ -22,15 +33,23 @@ public class UrlBlocklist { public boolean isUrlBlocked(EdgeUrl url) { try { + if (badDomains.contains(url.domain.domain)) { + return true; + } + if ("github.com".equals(url.domain.domain)) { return url.path.chars().filter(c -> c == '/').count() > 2; } - return patterns.stream().anyMatch(p -> p.test(url.path)); + for (var p : patterns) { + if (p.test(url.path)) + return true; + } } catch (StackOverflowError ex) { return true; } + return false; } public boolean isMailingListLink(EdgeUrl linkUrl) { @@ -38,12 +57,9 @@ public class UrlBlocklist { if (path.startsWith("/lists/")) { return true; } - if (path.startsWith("mailinglist")) { + if (path.contains("mailinglist")) { return true; } return false; } - - - } From 89cca4dbff4f007969ac65205e011b37c035be1d Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sat, 16 Jul 2022 21:27:04 +0200 Subject: [PATCH 24/40] Better logging for rare parsing exception --- .../marginalia/wmsa/edge/converting/ConvertedDomainReader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java index 9e61c682..eca74633 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java @@ -45,7 +45,7 @@ public class ConvertedDomainReader { try { ret.add(gson.fromJson(parts[1], type)); } - catch (JsonParseException ex) { + catch (NullPointerException|JsonParseException ex) { logger.warn("Failed to deserialize {} {}", type.getSimpleName(), StringUtils.abbreviate(parts[1], 255)); logger.warn("Json error", ex); } From c5dbe269f716f780e5cffb87166d72394c34e52c Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sun, 17 Jul 2022 15:17:39 +0200 Subject: [PATCH 25/40] Better logging for URL errors --- .../marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java index d09fac4a..a3fd2797 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java @@ -53,6 +53,10 @@ public class SqlLoadUrls { { conn.setAutoCommit(false); for (var url : urls) { + if (url.path.length() >= 255) { + logger.warn("Skipping bad URL {}", url); + continue; + } insertCall.setString(1, url.proto); insertCall.setString(2, url.domain.toString()); @@ -68,7 +72,7 @@ public class SqlLoadUrls { insertCall.addBatch(); } var ret = insertCall.executeBatch(); - for (int rv = 0; rv < urls.length; rv++) { + for (int rv = 0; rv < ret.length; rv++) { if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { logger.warn("load({}) -- bad row count {}", urls[rv], ret[rv]); } From f4966cf1f9e6b3a29007d28916c4b33e55492bd2 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sun, 17 Jul 2022 15:18:16 +0200 Subject: [PATCH 26/40] Fix bug in keyword loading when keywords have non-ASCII symbols --- .../lexicon/journal/KeywordLexiconJournalFile.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java index a97eee6c..241ddefb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java @@ -70,7 +70,10 @@ public class KeywordLexiconJournalFile { buffer.flip(); } - int len = buffer.get(); + int len = buffer.get() & 0xFF; + if (len > Byte.MAX_VALUE) { + logger.warn("Found keyword with impossible length {} near {}, likely corruption", len, cp); + } while (buffer.limit() - buffer.position() < len) { buffer.compact(); int rb = channel.read(buffer); @@ -126,8 +129,9 @@ public class KeywordLexiconJournalFile { for (String item : data) { writeBuffer.clear(); - writeBuffer.put((byte) item.length()); - writeBuffer.put(item.getBytes()); + byte[] itemBytes = item.getBytes(); + writeBuffer.put((byte)itemBytes.length); + writeBuffer.put(itemBytes); writeBuffer.flip(); while (writeBuffer.position() < writeBuffer.limit()) From e30a20bb7447f108861d4af766c16b66bd7e1211 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sun, 17 Jul 2022 19:31:49 +0200 Subject: [PATCH 27/40] Fix bug in keyword loading when keywords have non-ASCII symbols, cleaner solution --- .../wmsa/edge/index/EdgeIndexService.java | 2 +- .../edge/index/lexicon/KeywordLexicon.java | 22 ++++++++++++------- .../journal/KeywordLexiconJournal.java | 4 ++-- .../KeywordLexiconJournalCommitQueue.java | 6 ++--- .../journal/KeywordLexiconJournalFile.java | 7 +++--- 5 files changed, 23 insertions(+), 18 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index 96f1fb72..b4915df7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -199,7 +199,7 @@ public class EdgeIndexService extends Service { private long[] getOrInsertWordIds(List words) { return words.stream() - .filter(w -> w.length() < Byte.MAX_VALUE) + .filter(w -> w.getBytes().length < Byte.MAX_VALUE) .mapToLong(keywordLexicon::getOrInsert) .toArray(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java index 8d15f8f3..667ea6b1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java @@ -9,6 +9,7 @@ import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.nio.charset.StandardCharsets; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; @@ -40,19 +41,23 @@ public class KeywordLexicon implements AutoCloseable { logger.error("MULTIPLE WRITER INSTANCES!"); } - journal.loadFile(this::loadJournalEntry); + journal.loadFile(bytes -> reverseIndex.put(hashFunction.hashBytes(bytes).padToLong())); logger.info("Done creating dictionary writer"); } - private void loadJournalEntry(byte[] bytes) { - final long key = hashFunction.hashBytes(bytes).padToLong(); - reverseIndex.put(key); + public int getOrInsert(String macroWord) { + return getOrInsert(macroWord.getBytes(StandardCharsets.UTF_8)); } @SneakyThrows - public int getOrInsert(String macroWord) { - final long key = hashFunction.hashBytes(macroWord.getBytes()).padToLong(); + private int getOrInsert(byte[] bytes) { + if (bytes.length >= Byte.MAX_VALUE) { + logger.warn("getOrInsert({}), illegal length {}", bytes, bytes.length); + return DictionaryHashMap.NO_VALUE; + } + + final long key = hashFunction.hashBytes(bytes).padToLong(); int idx = getReadOnly(key); if (idx >= 0) @@ -66,7 +71,7 @@ public class KeywordLexicon implements AutoCloseable { if ((idx = reverseIndex.get(key)) >= 0) return idx; - journal.enqueue(macroWord); + journal.enqueue(bytes); idx = reverseIndex.put(key); request_time_metrics.set(reverseIndex.size()); @@ -78,7 +83,8 @@ public class KeywordLexicon implements AutoCloseable { } public int getReadOnly(String word) { - return getReadOnly(hashFunction.hashBytes(word.getBytes()).padToLong()); + final byte[] bytes = word.getBytes(StandardCharsets.UTF_8); + return getReadOnly(hashFunction.hashBytes(bytes).padToLong()); } public int getReadOnly(long hashedKey) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java index 02d50862..c226c1e6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java @@ -30,7 +30,7 @@ public class KeywordLexiconJournal { Runtime.getRuntime().addShutdownHook(new Thread(this::commitToDisk)); } - public void enqueue(String word) throws InterruptedException { + public void enqueue(byte[] word) throws InterruptedException { commitQueue.enqueue(word); } @@ -49,7 +49,7 @@ public class KeywordLexiconJournal { } public void commitToDisk() { - List entries = commitQueue.getQueuedEntries(); + List entries = commitQueue.getQueuedEntries(); journalFile.writeEntriesToJournal(entries); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java index 6baef0e1..67d4043a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java @@ -8,11 +8,11 @@ import java.util.Collections; import java.util.List; public class KeywordLexiconJournalCommitQueue { - private final ArrayList commitQueue = new ArrayList<>(10_000); + private final ArrayList commitQueue = new ArrayList<>(10_000); private final Logger logger = LoggerFactory.getLogger(getClass()); private static final long BACK_PRESSURE_LIMIT = 25_000; - public synchronized void enqueue(String word) throws InterruptedException { + public synchronized void enqueue(byte[] word) throws InterruptedException { for (int queueSize = commitQueue.size(); queueSize >= BACK_PRESSURE_LIMIT; queueSize = commitQueue.size()) @@ -24,7 +24,7 @@ public class KeywordLexiconJournalCommitQueue { } - public synchronized List getQueuedEntries() { + public synchronized List getQueuedEntries() { if (commitQueue.isEmpty()) return Collections.emptyList(); var data = new ArrayList<>(commitQueue); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java index 241ddefb..b68ee1fe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java @@ -110,7 +110,7 @@ public class KeywordLexiconJournalFile { private final ByteBuffer writeBuffer = ByteBuffer.allocateDirect(4096); - public void writeEntriesToJournal(List data) { + public void writeEntriesToJournal(List data) { if (data.isEmpty()) return; @@ -127,10 +127,9 @@ public class KeywordLexiconJournalFile { long start = System.currentTimeMillis(); int ct = data.size(); - for (String item : data) { + for (byte[] itemBytes : data) { writeBuffer.clear(); - byte[] itemBytes = item.getBytes(); - writeBuffer.put((byte)itemBytes.length); + writeBuffer.put((byte) itemBytes.length); writeBuffer.put(itemBytes); writeBuffer.flip(); From e22748e990e2dd46fd46f1fe8077918c23e7d6a8 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sun, 17 Jul 2022 22:08:06 +0200 Subject: [PATCH 28/40] Better error logging for IO errors during conversion from configuration issues. --- .../wmsa/edge/index/conversion/SearchIndexConverter.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java index adce8747..79c47a08 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java @@ -88,6 +88,10 @@ public class SearchIndexConverter { Files.delete(tmpUrlsFile); } + catch (IOException ex) { + logger.error("Failed to convert", ex); + throw ex; + } finally { lock.unlock(); } From 9f7a28cbdbaa967835cf53b5a6b8212dce0fd572 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sun, 17 Jul 2022 22:21:41 +0200 Subject: [PATCH 29/40] Made search service more robust toward the case where Encyclopedia or Assistant is down --- .../wmsa/edge/EdgeSearchE2ETest.java | 7 +++-- .../assistant/client/AssistantClient.java | 30 +++++++++++++++---- .../wmsa/edge/search/EdgeSearchOperator.java | 14 +++++---- .../wmsa/encyclopedia/EncyclopediaClient.java | 8 ++++- 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java index 08408de2..e04dd71b 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java @@ -12,7 +12,10 @@ import org.openqa.selenium.chrome.ChromeOptions; import org.openzim.ZIMTypes.ZIMFile; import org.openzim.ZIMTypes.ZIMReader; import org.slf4j.LoggerFactory; -import org.testcontainers.containers.*; +import org.testcontainers.containers.BindMode; +import org.testcontainers.containers.BrowserWebDriverContainer; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.NginxContainer; import org.testcontainers.containers.output.Slf4jLogConsumer; import org.testcontainers.containers.wait.strategy.Wait; import org.testcontainers.junit.jupiter.Container; @@ -41,8 +44,6 @@ public class EdgeSearchE2ETest extends E2ETestBase { @Container public static GenericContainer assistantContainer = forService(EDGE_ASSISTANT, mariaDB); @Container - public static GenericContainer encyclopediaContainer = forService(ENCYCLOPEDIA, mariaDB); - @Container public static GenericContainer indexContainer = forService(EDGE_INDEX, mariaDB); @Container diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java index de0b9313..63f8e255 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java @@ -4,10 +4,10 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import io.reactivex.rxjava3.core.Observable; import nu.marginalia.wmsa.client.AbstractDynamicClient; +import nu.marginalia.wmsa.client.exception.RouteNotConfiguredException; import nu.marginalia.wmsa.configuration.ServiceDescriptor; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.assistant.dict.DictionaryResponse; -import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; import org.eclipse.jetty.util.UrlEncoded; import java.util.List; @@ -21,18 +21,38 @@ public class AssistantClient extends AbstractDynamicClient { } public Observable dictionaryLookup(Context ctx, String word) { - return super.get(ctx,"/dictionary/" + UrlEncoded.encodeString(word), DictionaryResponse.class); + try { + return super.get(ctx, "/dictionary/" + UrlEncoded.encodeString(word), DictionaryResponse.class); + } + catch (RouteNotConfiguredException ex) { + return Observable.empty(); + } } @SuppressWarnings("unchecked") public Observable> spellCheck(Context ctx, String word) { - return (Observable>) (Object) super.get(ctx,"/spell-check/" + UrlEncoded.encodeString(word), List.class); + try { + return (Observable>) (Object) super.get(ctx, "/spell-check/" + UrlEncoded.encodeString(word), List.class); + } + catch (RouteNotConfiguredException ex) { + return Observable.empty(); + } } public Observable unitConversion(Context ctx, String value, String from, String to) { - return super.get(ctx,"/unit-conversion?value="+value + "&from="+from+"&to="+to); + try { + return super.get(ctx, "/unit-conversion?value=" + value + "&from=" + from + "&to=" + to); + } + catch (RouteNotConfiguredException ex) { + return Observable.empty(); + } } public Observable evalMath(Context ctx, String expression) { - return super.get(ctx,"/eval-expression?value="+UrlEncoded.encodeString(expression)); + try { + return super.get(ctx, "/eval-expression?value=" + UrlEncoded.encodeString(expression)); + } + catch (RouteNotConfiguredException ex) { + return Observable.empty(); + } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java index add46ef4..a6dff7fc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java @@ -11,17 +11,19 @@ import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.model.*; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeId; +import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.search.*; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults; -import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery; import nu.marginalia.wmsa.edge.search.query.QueryFactory; +import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery; import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; -import nu.marginalia.wmsa.edge.search.results.SearchResultValuator; -import nu.marginalia.wmsa.edge.search.results.model.AccumulatedQueryResults; import nu.marginalia.wmsa.edge.search.results.SearchResultDecorator; +import nu.marginalia.wmsa.edge.search.results.SearchResultValuator; import nu.marginalia.wmsa.edge.search.results.UrlDeduplicator; +import nu.marginalia.wmsa.edge.search.results.model.AccumulatedQueryResults; import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient; import org.apache.logging.log4j.util.Strings; import org.jetbrains.annotations.NotNull; @@ -251,7 +253,9 @@ public class EdgeSearchOperator { .encyclopediaLookup(ctx, humanQuery.replaceAll("\\s+", "_") .replaceAll("\"", "") - ).subscribeOn(Schedulers.io()); + ) + .onErrorReturn(e -> new WikiArticles()) + .subscribeOn(Schedulers.io()); } private void fetchResultsMulti(Context ctx, EdgeSearchQuery processedQuery, AccumulatedQueryResults queryResults, UrlDeduplicator deduplicator) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java index dd382220..0b7e5491 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java @@ -3,6 +3,7 @@ package nu.marginalia.wmsa.encyclopedia; import io.reactivex.rxjava3.core.Observable; import nu.marginalia.wmsa.client.AbstractDynamicClient; import nu.marginalia.wmsa.client.HttpStatusCode; +import nu.marginalia.wmsa.client.exception.RouteNotConfiguredException; import nu.marginalia.wmsa.configuration.ServiceDescriptor; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; @@ -28,7 +29,12 @@ public class EncyclopediaClient extends AbstractDynamicClient { @CheckReturnValue public Observable encyclopediaLookup(Context ctx, String word) { - return super.get(ctx,"/encyclopedia/" + UrlEncoded.encodeString(word), WikiArticles.class); + try { + return super.get(ctx, "/encyclopedia/" + UrlEncoded.encodeString(word), WikiArticles.class); + } + catch (RouteNotConfiguredException ex) { + return Observable.fromSupplier(WikiArticles::new); + } } } From 3d1031f8e4d92ee1e1c5436579a88c9ec42e6340 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 18 Jul 2022 17:13:47 +0200 Subject: [PATCH 30/40] Add lexicon dumping utility --- .../journal/KeywordLexiconJournalFile.java | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java index b68ee1fe..a9271453 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java @@ -1,5 +1,6 @@ package nu.marginalia.wmsa.edge.index.lexicon.journal; +import lombok.SneakyThrows; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -21,9 +22,22 @@ public class KeywordLexiconJournalFile { private final ReadWriteLock diskLock = new ReentrantReadWriteLock(); + @SneakyThrows + public static void main(String... args) { + if (args.length != 1) { + System.err.println("Dumps lexicon content to stdout"); + System.err.println("Arguments: filename"); + return; + } + + KeywordLexiconJournalFile lf = new KeywordLexiconJournalFile(new File(args[0])); + lf.loadFile(bytes -> { + System.out.println(new String(bytes)); + }); + } public KeywordLexiconJournalFile(File journalFile) throws IOException { - this.journalFileRAF = new RandomAccessFile(journalFile, "rw");; + this.journalFileRAF = new RandomAccessFile(journalFile, "rw"); this.journalFile = journalFile; } From 15bd54ef9fc7155ca30bedc1e229b2f6734c1ffb Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 18 Jul 2022 17:22:22 +0200 Subject: [PATCH 31/40] Tidy up LoaderMain a bit --- .../wmsa/edge/converting/LoaderMain.java | 25 +++++++------------ 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java index 6fe88d08..13678fd0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java @@ -3,7 +3,6 @@ package nu.marginalia.wmsa.edge.converting; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; @@ -25,14 +24,15 @@ import java.util.concurrent.atomic.AtomicInteger; public class LoaderMain { + + private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class); + private final Path processDir; private final EdgeCrawlPlan plan; private final ConvertedDomainReader instructionsReader; - private final HikariDataSource dataSource; - - private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class); private final LoaderFactory loaderFactory; private final EdgeIndexClient indexClient; + private volatile boolean running = true; final Thread processorThread = new Thread(this::processor, "Processor Thread"); @@ -56,14 +56,12 @@ public class LoaderMain { @Inject public LoaderMain(EdgeCrawlPlan plan, ConvertedDomainReader instructionsReader, - HikariDataSource dataSource, LoaderFactory loaderFactory, EdgeIndexClient indexClient) { this.processDir = plan.process.getDir(); this.plan = plan; this.instructionsReader = instructionsReader; - this.dataSource = dataSource; this.loaderFactory = loaderFactory; this.indexClient = indexClient; @@ -79,7 +77,7 @@ public class LoaderMain { LoaderMain.loadTotal = loadTotal.get(); WorkLog.readLog(logFile, entry -> { - load(entry.path(), entry.cnt()); + load(plan, entry.path(), entry.cnt()); }); running = false; @@ -90,15 +88,9 @@ public class LoaderMain { } private volatile static int loadTotal; - private static final int loaded = 0; - - private void load(String path, int cnt) { - String first = path.substring(0, 2); - String second = path.substring(2, 4); - Path destDir = processDir.resolve(first).resolve(second).resolve(path); - - + private void load(EdgeCrawlPlan plan, String path, int cnt) { + Path destDir = plan.getProcessedFilePath(path); try { var loader = loaderFactory.create(cnt); var instructions = instructionsReader.read(destDir, cnt); @@ -120,7 +112,8 @@ public class LoaderMain { loader.finish(); long loadTime = System.currentTimeMillis() - startTime; taskStats.observe(loadTime); - logger.info("Loaded {}/{} : {} ({}) {}ms {} l/s", taskStats.getCount(), loadTotal, path, loader.data.sizeHint, loadTime, taskStats.avgTime()); + logger.info("Loaded {}/{} : {} ({}) {}ms {} l/s", taskStats.getCount(), + loadTotal, path, loader.data.sizeHint, loadTime, taskStats.avgTime()); } } From 9ae76a926448e01c3a0844ea00aec8dcbf65209c Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 18 Jul 2022 18:36:39 +0200 Subject: [PATCH 32/40] Retire old and broken gemini support, needs to be re-implemented by having Memex talk to the API service rather than going directly to Search. --- .../wmsa/edge/search/EdgeSearchOperator.java | 2 +- .../wmsa/edge/search/EdgeSearchService.java | 16 ++------------- .../edge/search/command/CommandEvaluator.java | 20 +++++++++---------- .../edge/search/command/ResponseType.java | 5 ----- .../edge/search/command/SearchParameters.java | 2 +- .../search/command/commands/BangCommand.java | 15 -------------- .../command/commands/ConvertCommand.java | 10 +--------- .../command/commands/DefinitionCommand.java | 9 +-------- .../command/commands/SearchCommand.java | 13 ++---------- .../command/commands/SiteSearchCommand.java | 11 +--------- .../templates/edge/conversion-results-gmi.hdb | 12 ----------- .../templates/edge/dictionary-results-gmi.hdb | 17 ---------------- .../templates/edge/search-result-gmi.hdb | 4 ---- .../templates/edge/search-results-gmi.hdb | 19 ------------------ .../templates/edge/site-info-gmi.hdb | 13 ------------ 15 files changed, 19 insertions(+), 149 deletions(-) delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/ResponseType.java delete mode 100644 marginalia_nu/src/main/resources/templates/edge/conversion-results-gmi.hdb delete mode 100644 marginalia_nu/src/main/resources/templates/edge/dictionary-results-gmi.hdb delete mode 100644 marginalia_nu/src/main/resources/templates/edge/search-result-gmi.hdb delete mode 100644 marginalia_nu/src/main/resources/templates/edge/search-results-gmi.hdb delete mode 100644 marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java index a6dff7fc..d801965d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java @@ -89,7 +89,7 @@ public class EdgeSearchOperator { public DecoratedSearchResults doSearch(Context ctx, EdgeUserSearchParameters params, @Nullable Future eval) { Observable definitions = getWikiArticle(ctx, params.getHumanQuery()); - var processedQuery = queryFactory.createQuery(params); + EdgeSearchQuery processedQuery = queryFactory.createQuery(params); logger.info("Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ',')); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java index fa2d06e0..42830179 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java @@ -15,7 +15,6 @@ import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.Service; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; import nu.marginalia.wmsa.edge.search.command.CommandEvaluator; -import nu.marginalia.wmsa.edge.search.command.ResponseType; import nu.marginalia.wmsa.edge.search.command.SearchParameters; import nu.marginalia.wmsa.edge.search.exceptions.RedirectException; import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; @@ -27,7 +26,7 @@ import spark.Spark; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; -import java.util.*; +import java.util.Optional; import java.util.stream.Collectors; public class EdgeSearchService extends Service { @@ -149,21 +148,10 @@ public class EdgeSearchService extends Service { final String profileStr = Optional.ofNullable(request.queryParams("profile")).orElse("yolo"); final String humanQuery = queryParam.trim(); - final String format = request.queryParams("format"); - ResponseType responseType; - - if ("gmi".equals(format)) { - response.type("text/gemini"); - responseType = ResponseType.GEMINI; - } - else { - responseType = ResponseType.HTML; - } var params = new SearchParameters( EdgeSearchProfile.getSearchProfile(profileStr), - Optional.ofNullable(request.queryParams("js")).orElse("default"), - responseType); + Optional.ofNullable(request.queryParams("js")).orElse("default")); try { return searchCommandEvaulator.eval(ctx, params, humanQuery); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/CommandEvaluator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/CommandEvaluator.java index 9110969a..f9ae421a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/CommandEvaluator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/CommandEvaluator.java @@ -9,8 +9,8 @@ import java.util.List; public class CommandEvaluator { - private final List commands = new ArrayList<>(); - private final SearchCommand search; + private final List specialCommands = new ArrayList<>(); + private final SearchCommand defaultCommand; @Inject public CommandEvaluator( @@ -21,17 +21,17 @@ public class CommandEvaluator { BangCommand bang, SearchCommand search ) { - commands.add(browse); - commands.add(convert); - commands.add(define); - commands.add(site); - commands.add(bang); + specialCommands.add(browse); + specialCommands.add(convert); + specialCommands.add(define); + specialCommands.add(site); + specialCommands.add(bang); - this.search = search; + defaultCommand = search; } public Object eval(Context ctx, SearchParameters parameters, String query) { - for (var cmd : commands) { + for (var cmd : specialCommands) { var ret = cmd.process(ctx, parameters, query); if (ret.isPresent()) { return ret.get(); @@ -39,7 +39,7 @@ public class CommandEvaluator { } // Always process the search command last - return search.process(ctx, parameters, query) + return defaultCommand.process(ctx, parameters, query) .orElseThrow(() -> new IllegalStateException("Search Command returned Optional.empty()!") /* This Should Not be Possible™ */ ); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/ResponseType.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/ResponseType.java deleted file mode 100644 index 2ceb53a2..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/ResponseType.java +++ /dev/null @@ -1,5 +0,0 @@ -package nu.marginalia.wmsa.edge.search.command; - -public enum ResponseType { - HTML, GEMINI -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchParameters.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchParameters.java index 0eaf58b9..dc6ae832 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchParameters.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchParameters.java @@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.search.command; import nu.marginalia.wmsa.edge.search.EdgeSearchProfile; -public record SearchParameters(EdgeSearchProfile profile, String js, ResponseType responseType) { +public record SearchParameters(EdgeSearchProfile profile, String js) { public String profileStr() { return profile.name; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BangCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BangCommand.java index c5517246..afb22d1a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BangCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BangCommand.java @@ -2,28 +2,15 @@ package nu.marginalia.wmsa.edge.search.command.commands; import com.google.inject.Inject; import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; import nu.marginalia.wmsa.edge.search.exceptions.RedirectException; -import nu.marginalia.wmsa.edge.search.model.BrowseResultSet; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.io.IOException; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; import java.util.Optional; -import java.util.function.Predicate; -import java.util.regex.Pattern; public class BangCommand implements SearchCommandInterface { private final Map bangsToPattern = new HashMap<>(); @@ -39,9 +26,7 @@ public class BangCommand implements SearchCommandInterface { public Optional process(Context ctx, SearchParameters parameters, String query) { for (var entry : bangsToPattern.entrySet()) { - String key = entry.getKey(); matchBangPattern(query, entry.getKey(), entry.getValue()); - } return Optional.empty(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/ConvertCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/ConvertCommand.java index ff90ddf8..7f03c0fc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/ConvertCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/ConvertCommand.java @@ -3,7 +3,6 @@ package nu.marginalia.wmsa.edge.search.command.commands; import com.google.inject.Inject; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.search.UnitConversion; -import nu.marginalia.wmsa.edge.search.command.ResponseType; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; @@ -16,15 +15,12 @@ import java.util.Optional; public class ConvertCommand implements SearchCommandInterface { private final UnitConversion unitConversion; private final MustacheRenderer> conversionRenderer; - private final MustacheRenderer> conversionRendererGmi; @Inject public ConvertCommand(UnitConversion unitConversion, RendererFactory rendererFactory) throws IOException { this.unitConversion = unitConversion; conversionRenderer = rendererFactory.renderer("edge/conversion-results"); - conversionRendererGmi = rendererFactory.renderer("edge/conversion-results-gmi"); - } @Override @@ -34,10 +30,6 @@ public class ConvertCommand implements SearchCommandInterface { return Optional.empty(); } - if (parameters.responseType() == ResponseType.GEMINI) { - return Optional.of(conversionRendererGmi.render(Map.of("query", query, "result", conversion.get()))); - } else { - return Optional.of(conversionRenderer.render(Map.of("query", query, "result", conversion.get(), "profile", parameters.profileStr()))); - } + return Optional.of(conversionRenderer.render(Map.of("query", query, "result", conversion.get(), "profile", parameters.profileStr()))); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/DefinitionCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/DefinitionCommand.java index 63c6a981..d166fe30 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/DefinitionCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/DefinitionCommand.java @@ -6,7 +6,6 @@ import lombok.SneakyThrows; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.assistant.client.AssistantClient; import nu.marginalia.wmsa.edge.assistant.dict.DictionaryResponse; -import nu.marginalia.wmsa.edge.search.command.ResponseType; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; @@ -24,7 +23,6 @@ public class DefinitionCommand implements SearchCommandInterface { private final Logger logger = LoggerFactory.getLogger(getClass()); private final MustacheRenderer dictionaryRenderer; - private final MustacheRenderer dictionaryRendererGmi; private final AssistantClient assistantClient; @@ -36,7 +34,6 @@ public class DefinitionCommand implements SearchCommandInterface { { dictionaryRenderer = rendererFactory.renderer("edge/dictionary-results"); - dictionaryRendererGmi = rendererFactory.renderer("edge/dictionary-results-gmi"); this.assistantClient = assistantClient; } @@ -48,11 +45,7 @@ public class DefinitionCommand implements SearchCommandInterface { var results = lookupDefinition(ctx, query); - if (parameters.responseType() == ResponseType.GEMINI) { - return Optional.of(dictionaryRendererGmi.render(results, Map.of("query", parameters.profileStr()))); - } else { - return Optional.of(dictionaryRenderer.render(results, Map.of("query", query, "profile", parameters.profileStr()))); - } + return Optional.of(dictionaryRenderer.render(results, Map.of("query", query, "profile", parameters.profileStr()))); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java index f0d7c704..66a4e056 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java @@ -6,7 +6,6 @@ import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.search.EdgeSearchOperator; import nu.marginalia.wmsa.edge.search.UnitConversion; -import nu.marginalia.wmsa.edge.search.command.ResponseType; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults; @@ -25,7 +24,6 @@ public class SearchCommand implements SearchCommandInterface { private final EdgeSearchOperator searchOperator; private final UnitConversion unitConversion; private final MustacheRenderer searchResultsRenderer; - private final MustacheRenderer searchResultsRendererGmi; @Inject public SearchCommand(EdgeDomainBlacklist blacklist, @@ -39,23 +37,16 @@ public class SearchCommand implements SearchCommandInterface { this.unitConversion = unitConversion; searchResultsRenderer = rendererFactory.renderer("edge/search-results"); - searchResultsRendererGmi = rendererFactory.renderer("edge/search-results-gmi"); } @Override public Optional process(Context ctx, SearchParameters parameters, String query) { @CheckForNull Future eval = unitConversion.tryEval(ctx, query); - var results = searchOperator.doSearch(ctx, new EdgeUserSearchParameters(query, - parameters.profile(), parameters.js()), eval - ); + DecoratedSearchResults results = searchOperator.doSearch(ctx, new EdgeUserSearchParameters(query, parameters.profile(), parameters.js()), eval); results.getResults().removeIf(detail -> blacklist.isBlacklisted(dataStoreDao.getDomainId(detail.url.domain))); - if (parameters.responseType() == ResponseType.GEMINI) { - return Optional.of(searchResultsRendererGmi.render(results)); - } else { - return Optional.of(searchResultsRenderer.render(results)); - } + return Optional.of(searchResultsRenderer.render(results)); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java index 193f1a1c..fafcaa4b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java @@ -7,7 +7,6 @@ import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import nu.marginalia.wmsa.edge.search.EdgeSearchOperator; import nu.marginalia.wmsa.edge.search.EdgeSearchProfile; -import nu.marginalia.wmsa.edge.search.command.ResponseType; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet; @@ -34,7 +33,6 @@ public class SiteSearchCommand implements SearchCommandInterface { private final Logger logger = LoggerFactory.getLogger(getClass()); private final MustacheRenderer siteInfoRenderer; - private final MustacheRenderer siteInfoRendererGmi; private final Predicate queryPatternPredicate = Pattern.compile("^site:[.A-Za-z\\-0-9]+$").asPredicate(); @Inject @@ -50,7 +48,6 @@ public class SiteSearchCommand implements SearchCommandInterface { this.domainInformationService = domainInformationService; siteInfoRenderer = rendererFactory.renderer("edge/site-info"); - siteInfoRendererGmi = rendererFactory.renderer("edge/site-info-gmi"); } @Override @@ -73,13 +70,7 @@ public class SiteSearchCommand implements SearchCommandInterface { resultSet = new DecoratedSearchResultSet(Collections.emptyList()); } - if (parameters.responseType() == ResponseType.GEMINI) { - return Optional.of(siteInfoRendererGmi.render(results, Map.of("query", query))); - } else { - return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, "focusDomain", Objects.requireNonNullElse(domain, ""), "profile", parameters.profileStr(), "results", resultSet.resultSet, "screenshot", screenshotPath == null ? "" : screenshotPath.toString()))); - } - - + return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, "focusDomain", Objects.requireNonNullElse(domain, ""), "profile", parameters.profileStr(), "results", resultSet.resultSet, "screenshot", screenshotPath == null ? "" : screenshotPath.toString()))); } diff --git a/marginalia_nu/src/main/resources/templates/edge/conversion-results-gmi.hdb b/marginalia_nu/src/main/resources/templates/edge/conversion-results-gmi.hdb deleted file mode 100644 index 89c01d5e..00000000 --- a/marginalia_nu/src/main/resources/templates/edge/conversion-results-gmi.hdb +++ /dev/null @@ -1,12 +0,0 @@ -# Search Engine - -=> /search Search -=> /search-about.gmi About - -{{query}} = {{result}} - -## Warning - -These results use floating point calculations, and may not be accurate -for very large or very small numbers. Do not use for orbital calculations, -thesis projects, or other sensitive work. \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/dictionary-results-gmi.hdb b/marginalia_nu/src/main/resources/templates/edge/dictionary-results-gmi.hdb deleted file mode 100644 index 7de8f20a..00000000 --- a/marginalia_nu/src/main/resources/templates/edge/dictionary-results-gmi.hdb +++ /dev/null @@ -1,17 +0,0 @@ -# Search Engine - -=> /search Search -=> /search-about.gmi About - -## Results for "{{{query}}}" - -{{#each entries}} -({{type}}) - {{definition}} -{{/each}} - -## Legal - -These definitions are from wiktionary, available under GFDL and CC BY-SA 3.0, except for fair use exceptions. - -=> https://en.wiktionary.org/ -=> https://dumps.wikimedia.org/legal.html \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/search-result-gmi.hdb b/marginalia_nu/src/main/resources/templates/edge/search-result-gmi.hdb deleted file mode 100644 index fe20c054..00000000 --- a/marginalia_nu/src/main/resources/templates/edge/search-result-gmi.hdb +++ /dev/null @@ -1,4 +0,0 @@ - -### {{{title}}} -=> {{geminiLink}} - {{{description}}} diff --git a/marginalia_nu/src/main/resources/templates/edge/search-results-gmi.hdb b/marginalia_nu/src/main/resources/templates/edge/search-results-gmi.hdb deleted file mode 100644 index 32319c90..00000000 --- a/marginalia_nu/src/main/resources/templates/edge/search-results-gmi.hdb +++ /dev/null @@ -1,19 +0,0 @@ -# Search Engine - -=> /search Search -=> /search-about.gmi About - -{{#each problems}} -* {{{.}}}{{/each}} - -## Results for "{{{query}}}" - -{{#each results}} - -{{>edge/search-result-gmi}} - -{{/each}} - --- - -=> / To index \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb b/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb deleted file mode 100644 index cd8abf67..00000000 --- a/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb +++ /dev/null @@ -1,13 +0,0 @@ -# Search Engine - -=> /search Search -=> /search-about.gmi About - -## Results for "{{{query}}}" - -Blacklisted: {{blacklisted}} -Pages Known: {{pagesKnown}} -Pages Indexed: {{pagesKnown}} -Inbound Links: {{inboundLinks}} -Outbound Links: {{outboundLinks}} -Crawl Ranking: {{ranking}}% \ No newline at end of file From e83a7435c682ced3556e854b8d5e788e32418403 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Tue, 19 Jul 2022 01:42:17 +0200 Subject: [PATCH 33/40] Raise min document length a tad, we've been getting a bit too much almost empty documents in the index. --- .../nu/marginalia/wmsa/edge/converting/ConverterModule.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java index 1177c1a7..1731f610 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java @@ -27,7 +27,7 @@ public class ConverterModule extends AbstractModule { bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.); bind(Double.class).annotatedWith(Names.named("min-avg-document-quality")).toInstance(-25.); - bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(100); + bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(250); bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128); bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255); From 64844e1db2d96967e5d9386f02881b511c07ce97 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Tue, 19 Jul 2022 03:01:23 +0200 Subject: [PATCH 34/40] While some might ask, why would the server host IP be available as a search keyword? I only ask you hold my beer as I make it a reality. --- .../wmsa/edge/converting/processor/DocumentProcessor.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index 618e5efb..5bc2d274 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -183,6 +183,10 @@ public class DocumentProcessor { tagWords.add("proto:"+url.proto.toLowerCase()); tagWords.add("js:" + Boolean.toString(ret.features.contains(HtmlFeature.JS)).toLowerCase()); + if (domain.ip != null) { + tagWords.add("ip:" + domain.ip.toLowerCase()); // lower case because IPv6 is hexadecimal + } + ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add); words.append(IndexBlock.Meta, tagWords); From 825dea839d61bd334dcffa3f4bf231f8e962b9b0 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Tue, 19 Jul 2022 04:50:19 +0200 Subject: [PATCH 35/40] Tweaks to keyword extraction --- .../util/language/DocumentDebugger.java | 2 +- .../processing/DocumentKeywordExtractor.java | 32 +++++----- .../language/processing/KeywordCounter.java | 60 +++++-------------- .../language/processing/LongNameCounter.java | 2 +- .../language/processing/SubjectCounter.java | 31 ++++++++-- .../wmsa/edge/index/model/IndexBlock.java | 1 - .../java/org/openzim/ZIMTypes/ZIMReader.java | 11 +++- 7 files changed, 69 insertions(+), 70 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java index 6cee3058..d4c0232e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java @@ -69,7 +69,7 @@ public class DocumentDebugger { Set reps = new HashSet<>(); // kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed)); - kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed)); + kc.count(languageData).forEach(rep -> reps.add(rep.stemmed)); try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java index fbe0b8de..570d2462 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java @@ -39,13 +39,12 @@ public class DocumentKeywordExtractor { public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) { - var titleWords = extractTitleWords(documentLanguageData); - - var wordsTfIdf = tfIdfCounter.count(documentLanguageData, 0.75); - var wordsNamesRepeated = nameCounter.count(documentLanguageData, 2); - var wordsNamesAll = nameCounter.count(documentLanguageData, 1); - var subjects = subjectCounter.count(documentLanguageData); + List titleWords = extractTitleWords(documentLanguageData); + List wordsTfIdf = tfIdfCounter.count(documentLanguageData); + List wordsNamesRepeated = nameCounter.count(documentLanguageData, 2); + List wordsNamesAll = nameCounter.count(documentLanguageData, 1); + List subjects = subjectCounter.count(documentLanguageData); List wordsLongName = longNameCounter.count(documentLanguageData); int totalSize = wordsTfIdf.size(); @@ -55,8 +54,8 @@ public class DocumentKeywordExtractor { List topKeywords = new ArrayList<>(totalSize / 2); for(var v : wordsTfIdf) { - if (topKeywords.size() < totalSize / 10) topKeywords.add(v); - else if (midKeywords.size() < totalSize / 5) midKeywords.add(v); + if (topKeywords.size() <= totalSize / 10) topKeywords.add(v); + else if (midKeywords.size() <= totalSize / 5) midKeywords.add(v); else lowKeywords.add(v); } @@ -125,17 +124,18 @@ public class DocumentKeywordExtractor { } } - return counts.entrySet().stream().filter(c2 -> c2.getValue()>=1) - .sorted(Comparator.comparing(this::value)) + return counts.entrySet().stream() + .sorted(Comparator.comparing(e -> { + double N = 11820118.; // Number of documents in term freq dictionary + + // Caveat: This is actually the *negated* term score, because the second logarithm has + // its parameter inverted (log(a^b) = b log(a); here b = -1) + return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N); + })) .map(Map.Entry::getKey) - .limit(512).collect(Collectors.toSet()); + .limit(512).collect(Collectors.toCollection(LinkedHashSet::new)); } - private double value(Map.Entry e) { - double N = 11820118.; // Number of documents in term freq dictionary - - return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N); - } public EdgePageWords createWords(IndexBlock block, Collection words) { return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet())); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java index fbe0191c..49cee9bf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java @@ -1,15 +1,12 @@ package nu.marginalia.util.language.processing; import nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.util.language.processing.model.DocumentSentence; import nu.marginalia.util.language.processing.model.WordRep; -import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import java.util.*; import java.util.regex.Pattern; import java.util.stream.Collectors; -import java.util.stream.IntStream; public class KeywordCounter { private final KeywordExtractor keywordExtractor; @@ -20,58 +17,29 @@ public class KeywordCounter { this.keywordExtractor = keywordExtractor; } - public List count(DocumentLanguageData dld, double cutoff) { + public List count(DocumentLanguageData dld) { HashMap counts = new HashMap<>(1000); - HashMap> instances = new HashMap<>(1000); + HashMap> instances = new HashMap<>(1000); - for (int i = 0; i < dld.sentences.length; i++) { - DocumentSentence sent = dld.sentences[i]; - double value = 1.0 / Math.log(1+i); + for (var sent : dld.sentences) { var keywords = keywordExtractor.getKeywordsFromSentence(sent); for (var span : keywords) { - var stemmed = sent.constructStemmedWordFromSpan(span); - if (stemmed.isBlank()) - continue; - counts.merge(stemmed, value, Double::sum); + String stemmed = sent.constructStemmedWordFromSpan(span); - instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(sent.constructWordFromSpan(span)); + counts.merge(stemmed, 1., Double::sum); + instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span)); } } - var topWords = counts.entrySet().stream() - .filter(w -> w.getValue() > cutoff) + return counts.entrySet().stream() + .filter(e -> e.getValue() > 1) .sorted(Comparator.comparing(this::getTermValue)) - .limit(Math.min(100, counts.size()/2)) .map(Map.Entry::getKey) + .flatMap(w -> instances.get(w).stream()) + .filter(w -> w.word.length() > 1) + .limit(150) .collect(Collectors.toList()); - - var topWordsSet = new HashSet<>(topWords); - - final Set keywords = new HashSet<>(); - - for (var sentence : dld.sentences) { - for (WordSpan kw : keywordExtractor.getKeywordsFromSentence(sentence)) { - String stemmedWord = sentence.constructStemmedWordFromSpan(kw); - if (topWords.contains(stemmedWord)) { - keywords.add(new WordRep(sentence, kw)); - } - } - } - - for (var sentence : dld.sentences) { - for (var kw : keywordExtractor.getKeywordsFromSentenceStrict(sentence, topWordsSet, true)) { - keywords.add(new WordRep(sentence, kw)); - } - } - - Map sortOrder = IntStream.range(0, topWords.size()).boxed().collect(Collectors.toMap(topWords::get, i->i)); - - Comparator comp = Comparator.comparing(wr -> sortOrder.getOrDefault(wr.stemmed, topWords.size())); - - var ret = new ArrayList<>(keywords); - ret.sort(comp); - return ret; } private static final Pattern separator = Pattern.compile("_"); @@ -86,7 +54,11 @@ public class KeywordCounter { } double value(String key, double value) { - return (1+Math.log(value)) * Math.log((1.+dict.getTermFreq(key))/11820118.); + double freq = dict.getTermFreqStemmed(key); + if (freq < 1) { + freq = 10; + } + return (1+Math.log(value)) * Math.log((1.1+freq)/11820118.); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java index 7c976e24..3943e046 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java @@ -56,7 +56,7 @@ public class LongNameCounter { } double value(String key, double value) { - return (1+Math.log(value)) * Math.log((1.+dict.getTermFreq(key))/11820118.); + return (1+Math.log(value)) * Math.log((1.1+dict.getTermFreqStemmed(key))/11820118.); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java index d21b4904..80ff77f5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java @@ -5,7 +5,9 @@ import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.util.language.processing.model.tag.WordSeparator; -import java.util.*; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.stream.Collectors; public class SubjectCounter { @@ -15,6 +17,14 @@ public class SubjectCounter { this.keywordExtractor = keywordExtractor; } + // Seeks out subjects in a sentence by constructs like + // + // [Name] (Verbs) (the|a|Adverb|Verb) ... + // e.g. + // + // Greeks bearing gifts -> Greeks + // Steve McQueen drove fast | cars -> Steve McQueen + public List count(DocumentLanguageData dld) { Map counts = new HashMap<>(); @@ -27,9 +37,10 @@ public class SubjectCounter { || sentence.separators[kw.end + 1] == WordSeparator.COMMA) break; - if (("VBZ".equals(sentence.posTags[kw.end]) || "VBP".equals(sentence.posTags[kw.end])) - && ("DT".equals(sentence.posTags[kw.end + 1]) || "RB".equals(sentence.posTags[kw.end]) || sentence.posTags[kw.end].startsWith("VB")) - ) { + String nextTag = sentence.posTags[kw.end]; + String nextNextTag = sentence.posTags[kw.end+1]; + + if (isVerb(nextTag) && isDetOrAdverbOrVerb(nextNextTag)) { counts.merge(new WordRep(sentence, new WordSpan(kw.start, kw.end)), -1, Integer::sum); } } @@ -43,4 +54,16 @@ public class SubjectCounter { .collect(Collectors.toList()); } + private boolean isDetOrAdverbOrVerb(String posTag) { + return "DT".equals(posTag) // determinant + || "RB".equals(posTag) // adverb + || posTag.startsWith("VB") // verb + || posTag.startsWith("JJ"); // adjective + } + + boolean isVerb(String posTag) { + return posTag.startsWith("VB") + && !posTag.equals("VB"); // not interested in the infinitive + } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java index a347d2e4..14a654d9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java @@ -11,7 +11,6 @@ public enum IndexBlock { Meta(7, 7), PositionWords(8, 4.5), NamesWords(9, 5), - TermFreq(10, 10), Topic(11, 0.5); public final int id; diff --git a/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java b/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java index 7706e8d1..3e3bd58f 100644 --- a/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java +++ b/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java @@ -256,8 +256,8 @@ public class ZIMReader { try { getArticleData(consumer, pos, blobs); } - catch (IOException ex) { - + catch (Exception ex) { + ex.printStackTrace(); } }); @@ -384,7 +384,12 @@ public class ZIMReader { rb = is.read(data, trb, data.length - trb); trb += rb; } - consumer.accept(blobToUrl.get(blobNumber), new String(data)); + try { + consumer.accept(blobToUrl.get(blobNumber), new String(data)); + } + catch (Exception ex) { + ex.printStackTrace(); + } } } System.out.println(clusterNumber + " " + blobToUrl.size()); From ba375ef769fc7de082ab76a1a6d86c4a93b669b4 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Tue, 19 Jul 2022 04:50:19 +0200 Subject: [PATCH 36/40] Tweaks to keyword extraction --- .../e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java index e04dd71b..993e643e 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java @@ -212,9 +212,10 @@ public class EdgeSearchE2ETest extends E2ETestBase { var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); - assertEquals(List.of("Frog", "Binomial nomenclature", "Amphibian", "Mantis"), getTitlesFromSearchResults(html)); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search")); + + assertEquals(List.of("Frog", "Binomial nomenclature", "Mantis", "Amphibian"), getTitlesFromSearchResults(html)); + } @Test From fb91ce84f5d26043ab1d9a42d4304f1a1395f970 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Tue, 19 Jul 2022 05:08:06 +0200 Subject: [PATCH 37/40] Reduce log spam during conversion --- .../wmsa/edge/converting/processor/DocumentProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index 5bc2d274..f1f85e9b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -101,7 +101,7 @@ public class DocumentProcessor { } catch (DisqualifiedException ex) { ret.state = EdgeUrlState.DISQUALIFIED; - logger.info("Disqualified {}: {}", ret.url, ex.reason); + logger.debug("Disqualified {}: {}", ret.url, ex.reason); } catch (Exception ex) { ret.state = EdgeUrlState.DISQUALIFIED; From 51d273e39df7fee3184954e64421e8331201d017 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 20 Jul 2022 11:06:06 +0200 Subject: [PATCH 38/40] Store wiki articles in database instead of in the filesystem. --- .../wmsa/edge/EncyclopediaE2ETest.java | 41 +++--- .../wmsa/edge/assistant/dict/WikiCleaner.java | 39 +++--- .../edge/tools/EncyclopediaLoaderTool.java | 112 +++++++++------ .../wmsa/encyclopedia/EncyclopediaClient.java | 5 - .../wmsa/encyclopedia/EncyclopediaDao.java | 54 ++++---- .../encyclopedia/EncyclopediaService.java | 131 ++---------------- .../main/resources/sql/data-store-init.sql | 9 -- .../main/resources/sql/edge-crawler-cache.sql | 16 +-- .../main/resources/sql/monitor-log-init.sql | 11 -- .../src/main/resources/sql/reference-data.sql | 22 --- 10 files changed, 147 insertions(+), 293 deletions(-) delete mode 100644 marginalia_nu/src/main/resources/sql/data-store-init.sql delete mode 100644 marginalia_nu/src/main/resources/sql/monitor-log-init.sql delete mode 100644 marginalia_nu/src/main/resources/sql/reference-data.sql diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java index 69170aa3..4afa18c4 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java @@ -10,7 +10,7 @@ import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import org.mariadb.jdbc.Driver; -import org.openqa.selenium.By; +import org.openqa.selenium.OutputType; import org.openqa.selenium.chrome.ChromeOptions; import org.slf4j.LoggerFactory; import org.testcontainers.containers.*; @@ -23,16 +23,16 @@ import org.testcontainers.utility.MountableFile; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; +import java.nio.file.Files; import java.nio.file.Path; import java.sql.DriverManager; import java.sql.SQLException; import java.sql.Types; import java.time.Duration; +import java.time.LocalDateTime; import java.util.concurrent.TimeUnit; import static nu.marginalia.wmsa.configuration.ServiceDescriptor.ENCYCLOPEDIA; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; @Tag("e2e") @Testcontainers @@ -80,12 +80,23 @@ public class EncyclopediaE2ETest extends E2ETestBase { return Path.of(System.getProperty("user.dir")).resolve("data/test"); } + private static Path screenshotFilename(String operation) throws IOException { + var path = Path.of(System.getProperty("user.dir")).resolve("build/test/e2e/"); + Files.createDirectories(path); + + String name = String.format("test-encyclopedia-%s-%s.png", operation, LocalDateTime.now()); + path = path.resolve(name); + + System.out.println("Screenshot in " + path); + return path; + } + @Test - public void run() throws MalformedURLException { + public void run() throws IOException { new Driver(); try (var conn = DriverManager.getConnection(mariaDB.getJdbcUrl(), "wmsa", "wmsa"); - var stmt = conn.prepareStatement("INSERT IGNORE INTO REF_WIKI_TITLE(NAME,REF_NAME) VALUES (?,?)")) { + var stmt = conn.prepareStatement("INSERT IGNORE INTO REF_WIKI_ARTICLE(NAME,REF_NAME) VALUES (?,?)")) { stmt.setString(1, "Forg"); stmt.setString(2, "Frog"); @@ -102,24 +113,16 @@ public class EncyclopediaE2ETest extends E2ETestBase { var driver = chrome.getWebDriver(); driver.get("http://proxyNginx/wiki/Frog"); + Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("get-article")); + + driver.get("http://proxyNginx/wiki/Forg"); + Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("get-article-redir")); + System.out.println(driver.getTitle()); driver.get("http://proxyNginx/wiki-search?query=Forg"); + Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("disambig")); System.out.println(driver.getTitle()); - assertTrue(get(encyclopediaContainer.getHost(), - encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port), - "/wiki/has?url=Frog", Boolean.class)); - - assertFalse(get(encyclopediaContainer.getHost(), - encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port), - "/wiki/has?url=Marginalia", Boolean.class)); - - assertFalse(get(encyclopediaContainer.getHost(), - encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port), - "/wiki/has?url=Marginalia", Boolean.class)); - - - var resultsForMarginalia = get(encyclopediaContainer.getHost(), encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port), "/encyclopedia/Marginalia", WikiArticles.class); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/WikiCleaner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/WikiCleaner.java index 80560be1..cc70f441 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/WikiCleaner.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/WikiCleaner.java @@ -79,10 +79,7 @@ public class WikiCleaner { } }); - Optional.ofNullable(doc.getElementsByTag("cite")).ifPresent(cite -> cite.forEach(c -> { - c.tagName("span"); - })); - + doc.getElementsByTag("cite").tagName("span"); removeIds(doc, "toc", "catlinks", "Notes", "mw-navigation", "mw-data-after-content", "jump-to-nav"); removeByClass(doc, "mw-references-wrap", "references", "reference", "siteSub", "refbegin"); @@ -205,7 +202,7 @@ public class WikiCleaner { } }); doc.getAllElements().forEach(elem -> { - var classes = elem.classNames().stream().filter(this::isWikiClass).collect(Collectors.toList()); + var classes = elem.classNames().stream().filter(this::isWikiClass).toList(); classes.forEach(elem::removeClass); elem.removeAttr("lang"); elem.removeAttr("dir"); @@ -251,9 +248,8 @@ public class WikiCleaner { var formula = math.getElementsByTag("math"); var converter = net.sourceforge.jeuclid.converter.Converter.getInstance(); var sos = new ByteArrayOutputStream(); - var alt = Optional.ofNullable(formula.attr("alttext")) - .or(() -> Optional.ofNullable(math.getElementsByTag("annotation").text())) - .orElse(""); + var alt = Optional.of(formula.attr("alttext")).filter(s -> !s.isBlank()) + .orElseGet(() -> math.getElementsByTag("annotation").text()); var layoutContext = new LayoutContextImpl(LayoutContextImpl.getDefaultLayoutContext()); @@ -309,16 +305,16 @@ public class WikiCleaner { @NotNull private List> getWikiPageLinks(Document doc) { List> topLinks = new ArrayList<>(); - Optional.ofNullable(doc.select("p a")).ifPresent(links -> links.forEach(atag -> { + doc.select("p a").forEach(atag -> { String href = atag.attr("href"); - if (href != null && !href.isBlank() + if (!href.isBlank() && !href.contains(":") && !href.startsWith("#") ) { topLinks.add(Pair.of(href, atag.attr("title"))); } - })); + }); return topLinks; } @@ -336,19 +332,16 @@ public class WikiCleaner { private List> getDisambiguationLinks(Document doc) { List> disambig = new ArrayList<>(); + for (var note: doc.getElementsByClass("hatnote")) { + for (var atag : note.getElementsByTag("a")) { + String href = atag.attr("href"); + if (atag.hasClass("mw-disambig") && !href.isBlank()) { + disambig.add(Pair.of(href, atag.attr("title"))); + } + } + } + doc.getElementsByClass("hatnote").remove(); - Optional.ofNullable(doc.getElementsByClass("hatnote")).ifPresent(hatnotes -> { - hatnotes.forEach(note -> { - Optional.ofNullable(note.getElementsByTag("a")) - .ifPresent(links -> links.forEach(atag -> { - String href = atag.attr("href"); - if (atag.hasClass("mw-disambig") && href != null) { - disambig.add(Pair.of(href, atag.attr("title"))); - } - })); - }); - }); - Optional.ofNullable(doc.getElementsByClass("hatnote")).ifPresent(Elements::remove); return disambig; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/EncyclopediaLoaderTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/EncyclopediaLoaderTool.java index de6eb13b..f3582b12 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/EncyclopediaLoaderTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/EncyclopediaLoaderTool.java @@ -1,59 +1,85 @@ package nu.marginalia.wmsa.edge.tools; -import nu.marginalia.wmsa.configuration.server.Context; +import com.github.luben.zstd.Zstd; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.util.ParallelPipe; +import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner; -import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient; +import org.mariadb.jdbc.Driver; import org.openzim.ZIMTypes.ZIMFile; import org.openzim.ZIMTypes.ZIMReader; +import java.io.ByteArrayInputStream; import java.io.IOException; -import java.util.concurrent.*; +import java.nio.charset.StandardCharsets; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.SQLException; -public class EncyclopediaLoaderTool { +public class EncyclopediaLoaderTool extends ParallelPipe implements AutoCloseable { - static final EncyclopediaClient encyclopediaClient = new EncyclopediaClient(); + public static void main(String[] args) throws IOException, InterruptedException, SQLException { - public static void main(String[] args) throws IOException, InterruptedException { - convertAll(args); - encyclopediaClient.close(); + org.mariadb.jdbc.Driver driver = new Driver(); + + try (var loader = new EncyclopediaLoaderTool(new DatabaseModule().provideConnection())) { + var zr = new ZIMReader(new ZIMFile(args[0])); + + zr.forEachArticles((url, art) -> { + if (art != null) { + loader.accept(new ArticleRaw(url, art)); + } + }, p->true); + + } + catch (Exception ex) { + ex.printStackTrace(); + } System.exit(0); } - private static void convertAll(String[] args) throws IOException, InterruptedException { - var zr = new ZIMReader(new ZIMFile(args[0])); - - var pool = Executors.newFixedThreadPool(8); - var sem = new Semaphore(12); - zr.forEachArticles((url, art) -> { - if (art != null) { - try { - sem.acquire(); - - pool.execute(() -> { - try { - convert(url, art); - } finally { - sem.release(); - } - }); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - } - }, p -> true); - - sem.acquire(12); - - encyclopediaClient.close(); - } - - private static void convert(String url, String art) { - String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, art); - - if (null != newData) { - encyclopediaClient.submitWiki(Context.internal(), url, newData) - .retry(5) - .blockingSubscribe(); + public record ArticleRaw(String url, String art) { + public ArticleProcessed toProcessed(String data) { + return new ArticleProcessed(url, data); } } + public record ArticleProcessed(String url, String art) {} + + + private final HikariDataSource dataSource; + private final Connection connection; + private final PreparedStatement insertArticleDataStatement; + + private final WikiCleaner wikiCleaner = new WikiCleaner(); + + public EncyclopediaLoaderTool(HikariDataSource dataSource) throws SQLException { + super("EncyclopediaPipe", 24, 4, 2); + this.dataSource = dataSource; + this.connection = dataSource.getConnection(); + this.insertArticleDataStatement = connection.prepareStatement("REPLACE INTO REF_WIKI_ARTICLE(NAME, ENTRY) VALUES (?, ?)"); + + } + + @Override + protected ArticleProcessed onProcess(ArticleRaw articleRaw) { + return articleRaw.toProcessed(wikiCleaner.cleanWikiJunk("https://en.wikipedia.org/wiki/" + articleRaw.url, articleRaw.art)); + } + + @Override + protected void onReceive(ArticleProcessed articleProcessed) throws Exception { + if (articleProcessed.art == null) return; + + try (var bs = new ByteArrayInputStream(Zstd.compress(articleProcessed.art.getBytes(StandardCharsets.UTF_8)))) { + insertArticleDataStatement.setString(1, articleProcessed.url); + insertArticleDataStatement.setBlob(2, bs); + insertArticleDataStatement.executeUpdate(); + } + } + + public void close() throws Exception { + join(); + if (insertArticleDataStatement != null) insertArticleDataStatement.close(); + if (connection != null) connection.close(); + if (dataSource != null) dataSource.close(); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java index 0b7e5491..c2215526 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java @@ -22,11 +22,6 @@ public class EncyclopediaClient extends AbstractDynamicClient { return super.post(ctx, "/wiki/submit?url="+UrlEncoded.encodeString(url), data, MediaType.parse("text/plain; charset=UTF-8")); } - @CheckReturnValue - public Observable hasWiki(Context ctx, String url) { - return super.get(ctx, "/wiki/has?url="+ UrlEncoded.encodeString(url), Boolean.class); - } - @CheckReturnValue public Observable encyclopediaLookup(Context ctx, String word) { try { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaDao.java index 771f29d3..d9b9d5e9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaDao.java @@ -1,5 +1,6 @@ package nu.marginalia.wmsa.encyclopedia; +import com.github.luben.zstd.ZstdInputStream; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; @@ -7,12 +8,13 @@ import nu.marginalia.wmsa.edge.assistant.dict.WikiSearchResult; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.OutputStream; import java.util.*; import java.util.stream.Collectors; public class EncyclopediaDao { - private HikariDataSource dataSource; + private final HikariDataSource dataSource; private static final Logger logger = LoggerFactory.getLogger(EncyclopediaDao.class); @Inject @@ -20,12 +22,29 @@ public class EncyclopediaDao { this.dataSource = dataSource; } + public boolean getWikiArticleData(String name, OutputStream outputStream) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT ENTRY FROM REF_WIKI_ARTICLE WHERE NAME=? AND ENTRY IS NOT NULL")) + { + stmt.setString(1, name); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + new ZstdInputStream(rsp.getBlob(1).getBinaryStream()).transferTo(outputStream); + return true; + } + } + catch (Exception ex) { + logger.error("Failed to fetch article", ex); + } + return false; + } + public WikiArticles encyclopedia(String term) { WikiArticles response = new WikiArticles(); response.entries = new ArrayList<>(); try (var connection = dataSource.getConnection()) { - var stmt = connection.prepareStatement("SELECT DISTINCT(NAME_LOWER) FROM REF_WIKI_TITLE WHERE NAME_LOWER=?"); + var stmt = connection.prepareStatement("SELECT DISTINCT(NAME) FROM REF_WIKI_ARTICLE WHERE NAME=?"); stmt.setString(1, term); var rsp = stmt.executeQuery(); @@ -45,7 +64,7 @@ public class EncyclopediaDao { final List matches = new ArrayList<>(); try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) { + try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_ARTICLE WHERE NAME=?")) { stmt.setString(1, term); var rsp = stmt.executeQuery(); @@ -70,31 +89,6 @@ public class EncyclopediaDao { } - public Optional findEncyclopediaPageDirect(String term) { - - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) { - stmt.setString(1, term.replace(' ', '_')); - - var rsp = stmt.executeQuery(); - while (rsp.next()) { - String name = rsp.getString(1); - String refName = rsp.getString(2); - - if (refName == null) { - return Optional.of(new WikiSearchResult(name, null)); - } - } - } - } - catch (Exception ex) { - throw new RuntimeException(ex); - } - - return Optional.empty(); - } - public List findEncyclopediaPages(String term) { final List directMatches = new ArrayList<>(); final Set directSearchMatches = new HashSet<>(); @@ -102,7 +96,7 @@ public class EncyclopediaDao { try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) { + try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_ARTICLE WHERE NAME=?")) { stmt.setString(1, term.replace(' ', '_')); var rsp = stmt.executeQuery(); @@ -118,7 +112,7 @@ public class EncyclopediaDao { } } - try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER LIKE ? LIMIT 10")) { + try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_ARTICLE WHERE NAME LIKE ? LIMIT 10")) { stmt.setString(1, term.replace(' ', '_').replaceAll("%", "\\%").toLowerCase() + "%"); var rsp = stmt.executeQuery(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java index 156e2215..e2f0ab94 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java @@ -5,7 +5,6 @@ import com.google.gson.GsonBuilder; import com.google.inject.Inject; import com.google.inject.name.Named; import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.Service; @@ -17,38 +16,27 @@ import spark.Request; import spark.Response; import spark.Spark; -import java.io.FileInputStream; -import java.io.FileOutputStream; import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; import java.util.Map; -import java.util.Optional; -import java.util.stream.Collectors; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; public class EncyclopediaService extends Service { private static final Logger logger = LoggerFactory.getLogger(EncyclopediaService.class); private final MustacheRenderer wikiErrorPageRenderer; private final MustacheRenderer wikiSearchResultRenderer; - private final Gson gson = new GsonBuilder().create(); - private Path wikiPath; - private EncyclopediaDao encyclopediaDao; + private final EncyclopediaDao encyclopediaDao; @Inject public EncyclopediaService(@Named("service-host") String ip, @Named("service-port") Integer port, - @Named("wiki-path") Path wikiPath, EncyclopediaDao encyclopediaDao, RendererFactory rendererFactory, Initialization initialization, MetricsServer metricsServer) throws IOException { + super(ip, port, initialization, metricsServer); - this.wikiPath = wikiPath; this.encyclopediaDao = encyclopediaDao; if (rendererFactory != null) { @@ -60,12 +48,10 @@ public class EncyclopediaService extends Service { wikiSearchResultRenderer = null; } + Gson gson = new GsonBuilder().create(); Spark.get("/public/wiki/*", this::getWikiPage); Spark.get("/public/wiki-search", this::searchWikiPage); - - Spark.get("/wiki/has", this::pathWikiHas); - Spark.post("/wiki/submit", this::pathWikiSubmit); Spark.get("/encyclopedia/:term", (rq, rsp) -> encyclopediaDao.encyclopedia(rq.params("term")), gson::toJson); Spark.awaitInitialization(); @@ -74,44 +60,25 @@ public class EncyclopediaService extends Service { @SneakyThrows private Object getWikiPage(Request req, Response rsp) { final String[] splats = req.splat(); + if (splats.length == 0) rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html"); - final String name = splats[0]; String pageName = encyclopediaDao.resolveEncylopediaRedirect(name).orElse(name); - logger.info("Resolved {} -> {}", name, pageName); - return wikiGet(pageName) - .or(() -> resolveWikiPageNameWrongCase(name)) - .orElseGet(() -> renderSearchPage(name)); - } - - private Optional resolveWikiPageNameWrongCase(String name) { - var rsp = encyclopediaDao.findEncyclopediaPageDirect(name); - - if (rsp.isEmpty()) { - return Optional.of(renderSearchPage(name)); + if (!encyclopediaDao.getWikiArticleData(name, rsp.raw().getOutputStream())) { + return wikiErrorPageRenderer.render("https://en.wikipedia.org/wiki/" + name); } - - name = rsp.get().getInternalName(); - return wikiGet(name); - } - - private String renderSearchPage(String s) { - return wikiSearchResultRenderer.render( - Map.of("query", s, - "error", "true", - "results", encyclopediaDao.findEncyclopediaPages(s))); + return ""; } @SneakyThrows private Object searchWikiPage(Request req, Response rsp) { - final var ctx = Context.fromRequest(req); - String term = req.queryParams("query"); + if (null == term) { rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html"); return ""; @@ -124,86 +91,4 @@ public class EncyclopediaService extends Service { ); } - - - private Path getWikiFilename(Path base, String url) { - Path p = base; - - int urlHash = url.hashCode(); - - p = p.resolve(Integer.toString(urlHash & 0xFF)); - p = p.resolve(Integer.toString((urlHash>>>8) & 0xFF)); - p = p.resolve(Integer.toString((urlHash>>>16) & 0xFF)); - p = p.resolve(Integer.toString((urlHash>>>24) & 0xFF)); - - String fileName = url.chars() - .mapToObj(this::encodeUrlChar) - .collect(Collectors.joining()); - - if (fileName.length() > 128) { - fileName = fileName.substring(0, 128) + (((long)urlHash)&0xFFFFFFFFL); - } - - return p.resolve(fileName + ".gz"); - } - - - private String encodeUrlChar(int i) { - if (i >= 'a' && i <= 'z') { - return Character.toString(i); - } - if (i >= 'A' && i <= 'Z') { - return Character.toString(i); - } - if (i >= '0' && i <= '9') { - return Character.toString(i); - } - if (i == '.') { - return Character.toString(i); - } - else { - return String.format("%%%2X", i); - } - } - - @SneakyThrows - private Object pathWikiHas(Request request, Response response) { - return Files.exists(getWikiFilename(wikiPath, request.queryParams("url"))); - } - - - @SneakyThrows - private Optional wikiGet(String name) { - - var filename = getWikiFilename(wikiPath, name); - - if (Files.exists(filename)) { - try (var stream = new GZIPInputStream(new FileInputStream(filename.toFile()))) { - return Optional.of(new String(stream.readAllBytes())); - } - } else { - return Optional.empty(); - } - } - - - @SneakyThrows - private Object pathWikiSubmit(Request request, Response response) { - byte[] data = request.bodyAsBytes(); - - String wikiUrl = request.queryParams("url"); - Path filename = getWikiFilename(wikiPath, wikiUrl); - - Files.createDirectories(filename.getParent()); - - logger.debug("Writing {} to {}", wikiUrl, filename); - - try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) { - gos.write(data); - gos.flush(); - } - - return "ok"; - - } } diff --git a/marginalia_nu/src/main/resources/sql/data-store-init.sql b/marginalia_nu/src/main/resources/sql/data-store-init.sql deleted file mode 100644 index b47402b9..00000000 --- a/marginalia_nu/src/main/resources/sql/data-store-init.sql +++ /dev/null @@ -1,9 +0,0 @@ -DROP TABLE IF EXISTS JSON_DATA; - -CREATE TABLE IF NOT EXISTS JSON_DATA( - DOM VARCHAR(255), - ID VARCHAR(255), - MODEL VARCHAR(255), - DATA MEDIUMTEXT); - -CREATE INDEX IF NOT EXISTS JSON_DATA_INDEX ON JSON_DATA (DOM, ID, MODEL); \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index 120a1ce2..09203a2b 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -190,7 +190,7 @@ CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP); DROP TABLE IF EXISTS REF_DICTIONARY; -CREATE TABLE IF NOT EXISTS REF_DICTIONARY( +CREATE TABLE IF NOT EXISTS REF_DICTIONARY ( TYPE VARCHAR(16), WORD VARCHAR(255), DEFINITION VARCHAR(255) @@ -198,15 +198,15 @@ CREATE TABLE IF NOT EXISTS REF_DICTIONARY( CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; +---; + CREATE INDEX IF NOT EXISTS REF_DICTIONARY_WORD ON REF_DICTIONARY (WORD); -CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE ( - NAME VARCHAR(255), - NAME_LOWER VARCHAR(255) GENERATED ALWAYS AS (LOWER(NAME)), - REF_NAME VARCHAR(255) +CREATE TABLE IF NOT EXISTS REF_WIKI_ARTICLE ( + NAME VARCHAR(255) PRIMARY KEY, + REF_NAME VARCHAR(255) COMMENT "If this is a redirect, it redirects to this REF_WIKI_ARTICLE.NAME", + ENTRY LONGBLOB ) +ROW_FORMAT=DYNAMIC CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; - -CREATE INDEX IF NOT EXISTS REF_WIKI_LOWER ON REF_WIKI_TITLE (NAME_LOWER); -CREATE INDEX IF NOT EXISTS REF_WIKI_NAME ON REF_WIKI_TITLE (NAME); \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/sql/monitor-log-init.sql b/marginalia_nu/src/main/resources/sql/monitor-log-init.sql deleted file mode 100644 index 23ca4fca..00000000 --- a/marginalia_nu/src/main/resources/sql/monitor-log-init.sql +++ /dev/null @@ -1,11 +0,0 @@ -DROP TABLE IF EXISTS MONITOR_LOG; -DROP INDEX IF EXISTS MONITOR_LOG_INDEX; - -CREATE TABLE IF NOT EXISTS LOG_ENTRY ( - SERVICE VARCHAR(32), - STATUS VARCHAR(32), - IP VARCHAR(32), - PORT INTEGER, - TS VARCHAR(32)); - -CREATE INDEX IF NOT EXISTS MONITOR_LOG_INDEX ON LOG_ENTRY (SERVICE); \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/sql/reference-data.sql b/marginalia_nu/src/main/resources/sql/reference-data.sql deleted file mode 100644 index 733504ac..00000000 --- a/marginalia_nu/src/main/resources/sql/reference-data.sql +++ /dev/null @@ -1,22 +0,0 @@ -DROP TABLE IF EXISTS REF_DICTIONARY; - -CREATE TABLE IF NOT EXISTS REF_DICTIONARY( - TYPE VARCHAR(16), - WORD VARCHAR(255), - DEFINITION VARCHAR(255) -) -CHARACTER SET utf8mb4 -COLLATE utf8mb4_unicode_ci; - -CREATE INDEX IF NOT EXISTS REF_DICTIONARY_WORD ON REF_DICTIONARY (WORD); - -CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE( - NAME VARCHAR(255), - NAME_LOWER VARCHAR(255) GENERATED ALWAYS AS (LOWER(NAME)), - REF_NAME VARCHAR(255) -) -CHARACTER SET utf8mb4 -COLLATE utf8mb4_unicode_ci; - -CREATE INDEX IF NOT EXISTS REF_WIKI_LOWER ON REF_WIKI_TITLE (NAME_LOWER); -CREATE INDEX IF NOT EXISTS REF_WIKI_NAME ON REF_WIKI_TITLE (NAME); \ No newline at end of file From 6d1e2442b6f1f08b2caccaf4db41966903bda599 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 20 Jul 2022 11:06:06 +0200 Subject: [PATCH 39/40] Store wiki articles in database instead of in the filesystem. --- .../marginalia/wmsa/encyclopedia/EncyclopediaModule.java | 7 ------- 1 file changed, 7 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaModule.java index 39304638..0f46b164 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaModule.java @@ -1,18 +1,11 @@ package nu.marginalia.wmsa.encyclopedia; import com.google.inject.AbstractModule; -import com.google.inject.name.Names; import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.WmsaHome; - -import java.nio.file.Path; public class EncyclopediaModule extends AbstractModule { @SneakyThrows @Override public void configure() { - bind(Path.class) - .annotatedWith(Names.named("wiki-path")) - .toInstance(WmsaHome.getDisk("encyclopedia")); } } From 48812d8a4f4a955c70907967f6319dce111b71d8 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 20 Jul 2022 12:02:26 +0200 Subject: [PATCH 40/40] Store screenshots in database instead of in the filesystem. --- .../screenshot/ScreenshotLoaderMain.java | 40 +++++ .../screenshot/ScreenshotService.java | 156 ++++++++---------- .../main/resources/sql/edge-crawler-cache.sql | 11 ++ .../configuration/server/ServiceTest.java | 2 +- .../wmsa/edge/assistant/AssistantTest.java | 5 +- 5 files changed, 121 insertions(+), 93 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotLoaderMain.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotLoaderMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotLoaderMain.java new file mode 100644 index 00000000..a69420f4 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotLoaderMain.java @@ -0,0 +1,40 @@ +package nu.marginalia.wmsa.edge.assistant.screenshot; + +import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.mariadb.jdbc.Driver; + +import java.io.FileInputStream; +import java.io.IOException; +import java.sql.SQLException; +import java.util.zip.GZIPInputStream; + +public class ScreenshotLoaderMain { + public static void main(String... args) throws IOException { + + org.mariadb.jdbc.Driver driver = new Driver(); + var ds = new DatabaseModule().provideConnection(); + + try (var tis = new TarArchiveInputStream(new GZIPInputStream(new FileInputStream(args[0]))); + var conn = ds.getConnection(); + var ps = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA) VALUES (?,?,?)") + ) { + for (TarArchiveEntry entry = tis.getNextTarEntry(); entry != null; entry = tis.getNextTarEntry()) { + if (entry.isFile()) { + String fileName = entry.getName(); + String domainName = fileName.substring(fileName.indexOf('/')+1, fileName.lastIndexOf('.')); + + ps.setString(1, domainName); + ps.setString(2, "image/webp"); + ps.setBlob(3, tis); + ps.executeUpdate(); + + System.out.println(domainName); + } + } + } catch (SQLException e) { + e.printStackTrace(); + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotService.java index 59a719b2..2b562a83 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotService.java @@ -2,47 +2,49 @@ package nu.marginalia.wmsa.edge.assistant.screenshot; import com.google.common.base.Strings; import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import spark.Request; import spark.Response; -import spark.utils.IOUtils; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.NoSuchElementException; +import java.sql.SQLException; import static java.lang.Integer.parseInt; public class ScreenshotService { - private final Path screenshotsRoot = Path.of("/var/lib/wmsa/archive/screenshots/screenshots/"); - private final Path screenshotsRootWebp = Path.of("/var/lib/wmsa/archive.fast/screenshots/"); private final EdgeDataStoreDao edgeDataStoreDao; - private final long MIN_FILE_SIZE = 4096; + private final HikariDataSource dataSource; + + private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public ScreenshotService(EdgeDataStoreDao edgeDataStoreDao) { + public ScreenshotService(EdgeDataStoreDao edgeDataStoreDao, HikariDataSource dataSource) { this.edgeDataStoreDao = edgeDataStoreDao; + this.dataSource = dataSource; } public boolean hasScreenshot(EdgeId domainId) { - EdgeDomain domain = edgeDataStoreDao.getDomain(domainId); - - Path p = getScreenshotPath(screenshotsRootWebp, domain, ".webp"); - if (p == null) { - p = getScreenshotPath(screenshotsRoot, domain, ".png"); + try (var conn = dataSource.getConnection(); + var ps = conn.prepareStatement(""" + SELECT TRUE + FROM DATA_DOMAIN_SCREENSHOT + INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME + WHERE EC_DOMAIN.ID=? + """)) { + ps.setInt(1, domainId.id()); + var rs = ps.executeQuery(); + return rs.next(); } - - try { - return p != null && Files.size(p) >= MIN_FILE_SIZE; - } catch (IOException e) { - return false; + catch (SQLException ex) { + logger.warn("SQL error", ex); } + return false; } @SneakyThrows @@ -54,79 +56,55 @@ public class ScreenshotService { int id = parseInt(request.params("id")); - Path p = null; - if (id == 0) { - p = screenshotsRootWebp.resolve("dummy-snapshot.webp"); - } else { - EdgeDomain domain; - try { - domain = edgeDataStoreDao.getDomain(new EdgeId<>(id)); - p = getScreenshotPath(screenshotsRootWebp, domain, ".webp"); - if (p == null) { - p = getScreenshotPath(screenshotsRoot, domain, ".png"); - } - - if (p != null && Files.size(p) <= MIN_FILE_SIZE) { - p = null; - } - } catch (NoSuchElementException ex) { - domain = new EdgeDomain("error.example.com"); - } - - if (p == null) { - response.type("image/svg+xml"); - - return String.format("\n" + - "\n" + - " \n" + - " \n" + - " Placeholder\n" + - " %s\n" + - " \n" + - "\n", domain); + try (var conn = dataSource.getConnection(); + var ps = conn.prepareStatement(""" + SELECT CONTENT_TYPE, DATA + FROM DATA_DOMAIN_SCREENSHOT + INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME + WHERE EC_DOMAIN.ID=? + """)) { + ps.setInt(1, id); + var rsp = ps.executeQuery(); + if (rsp.next()) { + response.type(rsp.getString(1)); + rsp.getBlob(2).getBinaryStream().transferTo(response.raw().getOutputStream()); + return ""; } } - response.status(200); - response.header("Cache-control", "public,max-age=3600"); - if (p.toString().endsWith("webp")) { - response.type("image/webp"); - } else { - response.type("image/png"); + catch (SQLException ex) { + logger.warn("SQL error", ex); } - IOUtils.copy(new ByteArrayInputStream(Files.readAllBytes(p)), response.raw().getOutputStream()); - return ""; + + return serveSvgPlaceholder(response, id); } - private Path getScreenshotPath(Path root, EdgeDomain domain, String ending) { - - var p = root.resolve(domain.toString() + ending); - if (!p.normalize().startsWith(root)) { - return null; - } - - if (!Files.exists(p)) { - return null; - } - - return p; + private Object serveSvgPlaceholder(Response response, int id) { + response.type("image/svg+xml"); + return String.format("\n" + + "\n" + + " \n" + + " \n" + + " Placeholder\n" + + " %s\n" + + " \n" + + "\n", edgeDataStoreDao.getDomain(new EdgeId<>(id))); } - } diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index 09203a2b..b5dfaa17 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -210,3 +210,14 @@ CREATE TABLE IF NOT EXISTS REF_WIKI_ARTICLE ( ROW_FORMAT=DYNAMIC CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; + +---; + +CREATE TABLE IF NOT EXISTS DATA_DOMAIN_SCREENSHOT ( + DOMAIN_NAME VARCHAR(255) PRIMARY KEY, + CONTENT_TYPE ENUM ('image/png', 'image/webp', 'image/svg+xml') NOT NULL, + DATA LONGBLOB NOT NULL +) +ROW_FORMAT=DYNAMIC +CHARACTER SET utf8mb4 +COLLATE utf8mb4_unicode_ci; \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/server/ServiceTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/server/ServiceTest.java index 17dd7472..124c7826 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/server/ServiceTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/server/ServiceTest.java @@ -50,7 +50,7 @@ class ServiceTest { new DictionaryService(dataSource, new SpellChecker()), new MathParser(), new Units(new MathParser()), - new ScreenshotService(null), null); + new ScreenshotService(null, dataSource), null); Spark.awaitInitialization(); } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java index 21f82bfc..7e1571f7 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java @@ -6,12 +6,11 @@ import nu.marginalia.util.TestUtil; import nu.marginalia.wmsa.client.exception.RemoteException; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.configuration.server.Initialization; +import nu.marginalia.wmsa.edge.assistant.client.AssistantClient; import nu.marginalia.wmsa.edge.assistant.dict.DictionaryService; import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker; import nu.marginalia.wmsa.edge.assistant.eval.MathParser; import nu.marginalia.wmsa.edge.assistant.eval.Units; -import nu.marginalia.wmsa.edge.assistant.EdgeAssistantService; -import nu.marginalia.wmsa.edge.assistant.client.AssistantClient; import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService; import nu.marginalia.wmsa.edge.search.UnitConversion; import org.junit.jupiter.api.*; @@ -62,7 +61,7 @@ class AssistantTest { new DictionaryService(dataSource, new SpellChecker()), new MathParser(), new Units(new MathParser()), - new ScreenshotService(null), null); + new ScreenshotService(null, dataSource), null); Spark.awaitInitialization(); }