Links
- Nominal Quality: {{nominalQuality}}%
Crawl Ranking: {{ranking}}%
Incoming Links: {{incomingLinks}}
Outbound Links: {{outboundLinks}}
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java
index 1915d989..875cda37 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java
@@ -90,10 +90,10 @@ class BTreeWriterTest {
{
var writer = new BTreeWriter(mmf, ctx);
- writer.write(0, toPut.size(), (offset) -> {
+ writer.write(0, toPut.size(), (slice) -> {
for (int i = 0; i < data.length; i++) {
- mmf.put(offset + 2L*i, data[i]);
- mmf.put(offset + 2L*i + 1, i);
+ slice.put(2L*i, data[i]);
+ slice.put( 2L*i + 1, i);
}
});
mmf.force();
@@ -133,10 +133,10 @@ class BTreeWriterTest {
{
var writer = new BTreeWriter(mmf, ctx);
- writer.write( 0, toPut.size(), (offset) -> {
+ writer.write( 0, toPut.size(), (slice) -> {
for (int i = 0; i < data.length; i++) {
- mmf.put(offset + 2L*i, data[i]);
- mmf.put(offset + 2L*i + 1, i);
+ slice.put(2L*i, data[i]);
+ slice.put(2L*i + 1, i);
}
});
mmf.force();
@@ -182,9 +182,9 @@ class BTreeWriterTest {
try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
{
var writer = new BTreeWriter(mmf, ctx);
- writer.write(0, toPut.size(), (offset) -> {
+ writer.write(0, toPut.size(), (slice) -> {
for (int i = 0; i < data.length; i++) {
- mmf.put(offset + i, data[i]);
+ slice.put(i, data[i]);
}
});
mmf.force();
@@ -235,9 +235,9 @@ class BTreeWriterTest {
try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
{
var writer = new BTreeWriter(mmf, ctx);
- writer.write(0, toPut.size(), (offset) -> {
+ writer.write(0, toPut.size(), (slice) -> {
for (int i = 0; i < data.length; i++) {
- mmf.put(offset + i, data[i]);
+ slice.put(i, data[i]);
}
});
mmf.force();
@@ -288,10 +288,10 @@ class BTreeWriterTest {
try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
{
var writer = new BTreeWriter(mmf, ctx);
- writer.write(0, toPut.size(), (offset) -> {
+ writer.write(0, toPut.size(), (slice) -> {
for (int i = 0; i < data.length; i++) {
- mmf.put(offset + i*2L, data[i]);
- mmf.put(offset + i*2L+1, i);
+ slice.put(i*2L, data[i]);
+ slice.put(i*2L+1, i);
}
});
mmf.force();
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java
index 326c9b15..9331a998 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java
@@ -27,7 +27,7 @@ class LongPairHashMapTest {
try {
RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
- var lphm = new LongPairHashMap(mmf, 1024);
+ var lphm = LongPairHashMap.createNew(mmf, 1024);
toPut.forEach(i -> {
lphm.put(new LongPairHashMap.CellData(i, i));
});
@@ -36,7 +36,7 @@ class LongPairHashMapTest {
RandomAccessFile raf2 = new RandomAccessFile(tempFile.toFile(), "rw");
MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
- var lphm2 = new LongPairHashMap(mmf2);
+ var lphm2 = LongPairHashMap.loadExisting(mmf2);
toPut.forEach(i -> {
Assertions.assertTrue(lphm2.get(i).isSet());
Assertions.assertEquals(i, (int) lphm2.get(i).getKey());
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java
index b6e61aa2..961d8304 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java
@@ -1,11 +1,11 @@
package nu.marginalia.wmsa.edge.index.service;
import lombok.SneakyThrows;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader;
import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java
index 6b029da9..2b2da0fd 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java
@@ -3,14 +3,14 @@ package nu.marginalia.wmsa.edge.index.service;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.TestUtil;
-import nu.marginalia.wmsa.client.exception.RemoteException;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.edge.index.EdgeIndexService;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import nu.marginalia.wmsa.edge.model.EdgeId;
@@ -23,7 +23,6 @@ import org.junit.jupiter.api.parallel.ResourceAccessMode;
import org.junit.jupiter.api.parallel.ResourceLock;
import spark.Spark;
-import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
@@ -31,7 +30,6 @@ import java.util.List;
import java.util.stream.Collectors;
import static nu.marginalia.util.TestUtil.getConnection;
-import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java
index 6b219bad..edcfa71f 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java
@@ -1,14 +1,14 @@
package nu.marginalia.wmsa.edge.index.service;
import lombok.SneakyThrows;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndex;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl;
-import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
+import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.model.EdgeId;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
index 4aa9bceb..65b1ad57 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
@@ -13,6 +13,7 @@ class QueryVariantsTest {
QueryVariants variants;
QueryParser parser;
SentenceExtractor se;
+
@BeforeEach
public void setUp() {
LanguageModels lm = TestLanguageModels.getLanguageModels();
@@ -24,7 +25,7 @@ class QueryVariantsTest {
parser = new QueryParser(new EnglishDictionary(dict), variants);
}
- @Test
+ @Test @SuppressWarnings("unchecked")
void getQueryVariants() {
System.out.println(se.extractSentence("we are alone"));
testCase("DOS", List.of("DOS"));
@@ -50,7 +51,5 @@ class QueryVariantsTest {
private void testCase(String input, List... expected) {
var tokens = variants.getQueryVariants(parser.extractBasicTokens(input));
System.out.println(tokens);
-// var result = tokens.stream().map(lst -> lst.terms).collect(Collectors.toSet());
-// assertEquals(Set.of(expected), result, "Case failed: " + input);
}
}
\ No newline at end of file
From 81c77e7fcb2c5d31f13841462910003d745783e9 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Wed, 15 Jun 2022 16:49:18 +0200
Subject: [PATCH 25/27] Revert "Merge branch 'experimental' into master"
This reverts commit c3a432fdd42c4c08e271cf0a3c45f589cfe2bfb9, reversing
changes made to 1de63f225d9d425ee89741e5a3fa1b00893c5c5b.
---
marginalia_nu/build.gradle | 31 +-
.../wmsa/edge/EdgeSearchE2ETest.java | 23 +-
marginalia_nu/src/e2e/resources/init.sh | 2 +-
.../nu/marginalia/util/btree/BTreeWriter.java | 16 +-
.../marginalia/util/btree/WriteCallback.java | 4 +-
.../util/btree/model/BTreeHeader.java | 3 +-
.../marginalia/util/hash/LongPairHashMap.java | 45 +-
.../util/multimap/MultimapFileLong.java | 11 +-
.../multimap/MultimapFileLongOffsetSlice.java | 70 ---
.../util/multimap/MultimapFileLongSlice.java | 29 -
.../util/multimap/MultimapSearcher.java | 4 +-
.../util/multimap/MultimapSorter.java | 4 +-
.../marginalia/util/ranking/AcademiaRank.java | 49 ++
.../util/ranking/BetterReversePageRank.java | 8 +-
.../util/ranking/BetterStandardPageRank.java | 8 +-
.../util/ranking/BuggyReversePageRank.java | 8 +-
.../util/ranking/BuggyStandardPageRank.java | 8 +-
.../util/ranking/RankingAlgorithm.java | 303 ++++++---
.../util/ranking/RankingDomainData.java | 33 -
.../util/ranking/RankingDomainFetcher.java | 105 ----
.../ranking/old/OldReversePageRankV2.java | 4 +-
.../util/ranking/old/StandardPageRank.java | 4 +-
.../util/ranking/tool/DedupTool.java | 2 +-
.../util/ranking/tool/PerusePageRankV2.java | 4 +-
.../ranking/tool/TestAcademiaRankTool.java | 30 +
.../ranking/tool/UpdateDomainRanksTool.java | 14 +-
.../ranking/tool/UpdateDomainRanksTool2.java | 12 +-
.../edge/converting/ReindexTriggerMain.java | 4 +-
.../converting/interpreter/Interpreter.java | 2 +-
.../instruction/LoadProcessedDomain.java | 4 +-
.../wmsa/edge/converting/loader/Loader.java | 6 +-
.../converting/loader/SqlLoadDomainLinks.java | 6 +-
.../converting/loader/SqlLoadDomains.java | 27 +-
.../loader/SqlLoadProcessedDocument.java | 24 +-
.../loader/SqlLoadProcessedDomain.java | 27 +-
.../edge/converting/loader/SqlLoadUrls.java | 20 +-
.../processor/InstructionsCompiler.java | 2 +-
.../edge/crawling/CrawlJobExtractorMain.java | 11 +-
.../CrawlJobExtractorPageRankMain.java | 15 +-
.../wmsa/edge/data/dao/EdgeDataStoreDao.java | 32 +-
.../edge/data/dao/EdgeDataStoreDaoImpl.java | 588 ++++++++++++++++--
.../dao/task/EdgeDomainBlacklistImpl.java | 2 +-
.../wmsa/edge/index/EdgeIndexControl.java | 9 +-
.../wmsa/edge/index/EdgeIndexService.java | 6 +-
.../wmsa/edge/index/IndexServicesFactory.java | 16 +-
.../words/WordIndexLengthsTable.java | 10 -
.../words/WordIndexOffsetsTable.java | 67 --
.../conversion/words/WordIndexTables.java | 56 --
.../conversion/words/WordsTableWriter.java | 75 ---
.../index/{ => radix}/EdgeIndexBucket.java | 10 +-
.../SearchEngineRanking.java | 2 +-
.../SearchIndexDao.java | 43 +-
.../{reader => service}/SearchIndexes.java | 10 +-
.../wmsa/edge/index/service/SearchOrder.java | 6 +
.../dictionary/DictionaryReader.java | 2 +-
.../dictionary/DictionaryWriter.java | 2 +-
.../dictionary/TokenCompressor.java | 2 +-
.../ConversionUnnecessaryException.java | 2 +-
.../index}/SearchIndex.java | 6 +-
.../index}/SearchIndexConverter.java | 108 ++--
.../index}/SearchIndexPreconverter.java | 3 +-
.../index}/SearchIndexReader.java | 10 +-
.../index}/SearchIndexWriter.java | 2 +-
.../index}/SearchIndexWriterImpl.java | 4 +-
.../index/wordstable/BtreeWordsTable.java} | 90 +--
.../index/wordstable/IndexWordsTable.java | 48 ++
.../index/wordstable/WordsTableWriter.java | 85 +++
.../query/IndexQueryBuilder.java | 4 +-
.../query/IndexSearchBudget.java | 2 +-
.../{reader => service}/query/Query.java | 2 +-
.../query}/SearchIndexPartitioner.java | 4 +-
.../wmsa/edge/model/EdgeDomain.java | 5 +-
.../model/crawl/EdgeDomainIndexingState.java | 31 +-
.../model/search/EdgeSearchSpecification.java | 4 +-
.../edge/model/search/EdgeUrlDetails.java | 19 +-
.../wmsa/edge/search/EdgeSearchOperator.java | 3 +-
.../wmsa/edge/search/EdgeSearchProfile.java | 17 +-
.../command/commands/SiteSearchCommand.java | 4 +-
.../edge/search/model/DomainInformation.java | 1 +
.../wmsa/edge/search/query/QueryFactory.java | 1 +
.../search/results/SearchResultDecorator.java | 2 +-
.../siteinfo/DomainInformationService.java | 226 +------
.../wmsa/edge/tools/IndexMergerMain.java | 9 +-
.../main/resources/sql/edge-crawler-cache.sql | 176 ++++--
.../templates/edge/site-info-gmi.hdb | 1 +
.../resources/templates/edge/site-info.hdb | 1 +
.../java/nu/marginalia/util/TestUtil.java | 2 +-
.../util/btree/BTreeWriterTest.java | 26 +-
.../util/hash/LongPairHashMapTest.java | 4 +-
.../loader/SqlLoadDomainLinksTest.java | 48 --
.../converting/loader/SqlLoadDomainsTest.java | 52 --
.../loader/SqlLoadProcessedDocumentTest.java | 94 ---
.../loader/SqlLoadProcessedDomainTest.java | 54 --
.../converting/loader/SqlLoadUrlsTest.java | 50 --
.../index/service/DictionaryWriterTest.java | 8 +-
.../index/service/EdgeIndexClientTest.java | 6 +-
.../service/SearchIndexConverterTest.java | 89 +++
.../index/service/SearchIndexWriterTest.java | 14 +-
.../index/service/TokenCompressorTest.java | 2 +-
.../edge/search/query/QueryVariantsTest.java | 5 +-
100 files changed, 1667 insertions(+), 1577 deletions(-)
delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java
delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java
delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java
delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java
delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java
delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java
delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java
delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{ => radix}/EdgeIndexBucket.java (93%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{conversion => service}/SearchEngineRanking.java (97%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{conversion => service}/SearchIndexDao.java (64%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => service}/SearchIndexes.java (91%)
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{ => service}/dictionary/DictionaryReader.java (92%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{ => service}/dictionary/DictionaryWriter.java (99%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{ => service}/dictionary/TokenCompressor.java (97%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{conversion => service/index}/ConversionUnnecessaryException.java (80%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => service/index}/SearchIndex.java (93%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{conversion => service/index}/SearchIndexConverter.java (75%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{conversion => service/index}/SearchIndexPreconverter.java (97%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => service/index}/SearchIndexReader.java (96%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{journal => service/index}/SearchIndexWriter.java (88%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{journal => service/index}/SearchIndexWriterImpl.java (96%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader/IndexWordsTable.java => service/index/wordstable/BtreeWordsTable.java} (58%)
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => service}/query/IndexQueryBuilder.java (97%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => service}/query/IndexSearchBudget.java (87%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => service}/query/Query.java (73%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{conversion => service/query}/SearchIndexPartitioner.java (96%)
delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java
delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java
delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java
delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java
delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java
create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java
diff --git a/marginalia_nu/build.gradle b/marginalia_nu/build.gradle
index eb553649..b2115fb0 100644
--- a/marginalia_nu/build.gradle
+++ b/marginalia_nu/build.gradle
@@ -59,12 +59,12 @@ dependencies {
implementation "com.sparkjava:spark-core:2.9.3"
implementation 'com.opencsv:opencsv:5.6'
- implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2'
- implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2'
- implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2'
- implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2'
- implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2'
- implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2'
+ implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1'
+ implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1'
+ implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1'
+ implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1'
+ implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1'
+ implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1'
implementation 'org.slf4j:slf4j-api:1.7.36'
@@ -76,6 +76,7 @@ dependencies {
implementation 'com.github.ThatJavaNerd:JRAW:1.1.0'
implementation group: 'com.h2database', name: 'h2', version: '2.1.210'
+ testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.3.1'
implementation 'org.jsoup:jsoup:1.14.3'
implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2'
@@ -85,7 +86,7 @@ dependencies {
implementation 'com.zaxxer:HikariCP:5.0.1'
- implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
+ implementation 'org.apache.opennlp:opennlp-tools:1.9.3'
implementation 'io.prometheus:simpleclient:0.15.0'
implementation 'io.prometheus:simpleclient_servlet:0.15.0'
implementation 'io.prometheus:simpleclient_httpserver:0.15.0'
@@ -122,19 +123,15 @@ dependencies {
testImplementation 'org.projectlombok:lombok:1.18.24'
testAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
- testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.5.1'
-
- testImplementation platform('org.testcontainers:testcontainers-bom:1.17.2')
- testImplementation 'org.testcontainers:mariadb:1.17.2'
- testImplementation "org.testcontainers:junit-jupiter:1.17.2"
-
e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
- e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
- e2eTestImplementation 'org.testcontainers:nginx:1.17.2'
- e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2"
- e2eTestImplementation "org.testcontainers:selenium:1.17.2"
+ e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.22'
+ e2eTestImplementation 'org.testcontainers:mariadb:1.17.1'
+ e2eTestImplementation 'org.testcontainers:nginx:1.17.1'
+ e2eTestImplementation 'org.testcontainers:testcontainers:1.17.1'
+ e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.1"
+ e2eTestImplementation "org.testcontainers:selenium:1.17.1"
e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4'
e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
}
diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java
index 08408de2..af43e462 100644
--- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java
+++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java
@@ -28,7 +28,6 @@ import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
-import static org.junit.jupiter.api.Assertions.assertEquals;
@Tag("e2e")
@Testcontainers
@@ -157,16 +156,6 @@ public class EdgeSearchE2ETest extends E2ETestBase {
return wikipediaFiles.toString();
}
- private List getTitlesFromSearchResults(String html) {
- List ret = new ArrayList<>();
-
- for (var title : Jsoup.parse(html).select(".card.search-result > h2")) {
- ret.add(title.text());
- }
-
- return ret;
- }
-
@Test
public void testFrontPage() throws IOException {
var driver = chrome.getWebDriver();
@@ -184,9 +173,8 @@ public class EdgeSearchE2ETest extends E2ETestBase {
driver.get("http://proxyNginx/search?query=bird&profile=corpo");
System.out.println(driver.getTitle());
+ System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
- var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
- assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query"));
}
@@ -199,23 +187,20 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());
System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-info"));
}
-
@Test
public void testSiteSearch() throws IOException {
var driver = chrome.getWebDriver();
driver.get("http://proxyNginx/search?query=site:wikipedia.local%20frog");
System.out.println(driver.getTitle());
+ System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
- var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
-
- assertEquals(List.of("Frog", "Binomial nomenclature", "Amphibian", "Mantis"), getTitlesFromSearchResults(html));
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search"));
}
-
@Test
public void testBrowse() throws IOException {
var driver = chrome.getWebDriver();
@@ -224,6 +209,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());
System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse"));
}
@Test
@@ -234,6 +220,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());
System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("define"));
}
@Test
diff --git a/marginalia_nu/src/e2e/resources/init.sh b/marginalia_nu/src/e2e/resources/init.sh
index 50dbd406..5409f787 100644
--- a/marginalia_nu/src/e2e/resources/init.sh
+++ b/marginalia_nu/src/e2e/resources/init.sh
@@ -69,4 +69,4 @@ memex memex
dating dating
EOF
-WMSA_HOME=${HOME} java -server -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
\ No newline at end of file
+WMSA_HOME=${HOME} java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
\ No newline at end of file
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java
index b43faca7..28ac4914 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java
@@ -3,7 +3,6 @@ package nu.marginalia.util.btree;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
-import nu.marginalia.util.multimap.MultimapFileLongSlice;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -13,9 +12,9 @@ import java.io.IOException;
public class BTreeWriter {
private final Logger logger = LoggerFactory.getLogger(BTreeWriter.class);
private final BTreeContext ctx;
- private final MultimapFileLongSlice map;
+ private final MultimapFileLong map;
- public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) {
+ public BTreeWriter(MultimapFileLong map, BTreeContext ctx) {
this.map = map;
this.ctx = ctx;
}
@@ -32,18 +31,13 @@ public class BTreeWriter {
return size;
}
- /** Construct a BTree with numEntries entries at offset in the associated map
- *
- * @return The size of the written data
- */
- public long write(long offset, int numEntries, WriteCallback writeIndexCallback)
+ public long write(long offset, int numEntries, WriteCallback writeIndex)
throws IOException
{
- BTreeHeader header = makeHeader(offset, numEntries);
+ var header = makeHeader(offset, numEntries);
header.write(map, offset);
-
- writeIndexCallback.write(map.atOffset(header.dataOffsetLongs()));
+ writeIndex.write(header.dataOffsetLongs());
if (header.layers() < 1) {
return ctx.calculateSize(numEntries);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java
index a6225db1..70bd8132 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java
@@ -1,9 +1,7 @@
package nu.marginalia.util.btree;
-import nu.marginalia.util.multimap.MultimapFileLongSlice;
-
import java.io.IOException;
public interface WriteCallback {
- void write(MultimapFileLongSlice slice) throws IOException;
+ void write(long offset) throws IOException;
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java
index 8d68b424..4951f5b8 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java
@@ -1,7 +1,6 @@
package nu.marginalia.util.btree.model;
import nu.marginalia.util.multimap.MultimapFileLong;
-import nu.marginalia.util.multimap.MultimapFileLongSlice;
public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) {
public BTreeHeader {
@@ -29,7 +28,7 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, lon
return padding;
}
- public void write(MultimapFileLongSlice dest, long offset) {
+ public void write(MultimapFileLong dest, long offset) {
dest.put(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL));
dest.put(offset+1, indexOffsetLongs);
dest.put(offset+2, dataOffsetLongs);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java
index d1e056b9..6f8912a9 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java
@@ -1,7 +1,9 @@
package nu.marginalia.util.hash;
+import io.prometheus.client.Gauge;
import lombok.EqualsAndHashCode;
import lombok.Getter;
+import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.PrimeUtil;
import org.slf4j.Logger;
@@ -15,7 +17,9 @@ import static java.lang.Math.round;
*/
public class LongPairHashMap {
private static final Logger logger = LoggerFactory.getLogger(LongPairHashMap.class);
- private static final long MAGIC_WORD = 0xE00E00E00E0E0E0EL; // it's the data police
+ private static final Gauge probe_count_metrics
+ = Gauge.build("wmsa_wordfile_hash_map_probe_count", "Probing Count")
+ .register();
private final long hashTableSize;
private final MultimapFileLong data;
@@ -23,37 +27,26 @@ public class LongPairHashMap {
private int sz = 0;
private static final int HEADER_SIZE = 2;
- private LongPairHashMap(MultimapFileLong data, long hashTableSize, long maxProbeLength) {
+ public LongPairHashMap(MultimapFileLong data, long size) {
this.data = data;
- this.hashTableSize = hashTableSize;
- this.maxProbeLength = maxProbeLength;
- }
+ // Actually use a prime size for Donald Knuth reasons
+ hashTableSize = PrimeUtil.nextPrime(size, 1);
+ maxProbeLength = hashTableSize / 2;
- public static LongPairHashMap createNew(MultimapFileLong data, long size) {
- var tableSize = PrimeUtil.nextPrime(size, 1);
- var ret = new LongPairHashMap(data, tableSize, tableSize/2);
+ logger.debug("Table size = " + hashTableSize);
- data.put(0, MAGIC_WORD);
- data.put(1, tableSize);
-
- for (int i = 2; i < tableSize; i++) {
+ data.put(0, IndexWordsTable.Strategy.HASH.ordinal());
+ data.put(1, hashTableSize);
+ for (int i = 2; i < hashTableSize; i++) {
data.put(HEADER_SIZE + 2L*i, 0);
}
-
- return ret;
}
+ public LongPairHashMap(MultimapFileLong data) {
+ this.data = data;
+ hashTableSize = data.get(1);
+ maxProbeLength = hashTableSize / 10;
- public static LongPairHashMap loadExisting(MultimapFileLong data) {
- long key = data.get(0);
-
- if (key != MAGIC_WORD) {
- logger.warn("LongPairHashMap lacks magic word, could this be garbage data?");
- }
-
- var hashTableSize = data.get(1);
- var maxProbeLength = hashTableSize / 10;
-
- return new LongPairHashMap(data, hashTableSize, maxProbeLength);
+ logger.debug("Table size = " + hashTableSize);
}
public int size() {
@@ -98,6 +91,8 @@ public class LongPairHashMap {
final var val = getCell(idx);
if (!val.isSet()) {
+ probe_count_metrics.set(j);
+
return setValue(data, idx);
}
else if (val.getKey() == data.getKey()) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java
index f381a977..dca8248e 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java
@@ -21,7 +21,7 @@ import static java.nio.channels.FileChannel.MapMode.READ_WRITE;
import static nu.marginalia.util.FileSizeUtil.readableSize;
-public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
+public class MultimapFileLong implements AutoCloseable {
private final ArrayList buffers = new ArrayList<>();
private final ArrayList mappedByteBuffers = new ArrayList<>();
@@ -196,12 +196,10 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
}
}
- @Override
public long size() {
return fileLength;
}
- @Override
public void put(long idx, long val) {
if (idx >= mappedSize)
grow(idx);
@@ -216,7 +214,6 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
}
}
- @Override
public long get(long idx) {
if (idx >= mappedSize)
grow(idx);
@@ -232,12 +229,10 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
}
- @Override
public void read(long[] vals, long idx) {
read(vals, vals.length, idx);
}
- @Override
public void read(long[] vals, int n, long idx) {
if (idx+n >= mappedSize) {
grow(idx+n);
@@ -262,12 +257,10 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
}
- @Override
public void write(long[] vals, long idx) {
write(vals, vals.length, idx);
}
- @Override
public void write(long[] vals, int n, long idx) {
if (idx+n >= mappedSize) {
grow(idx+n);
@@ -292,7 +285,6 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
}
- @Override
public void write(LongBuffer vals, long idx) {
int n = vals.limit() - vals.position();
if (idx+n >= mappedSize) {
@@ -318,7 +310,6 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
}
- @Override
public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException {
int length = (int)(sourceEnd - sourceStart);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java
deleted file mode 100644
index c2630ddc..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java
+++ /dev/null
@@ -1,70 +0,0 @@
-package nu.marginalia.util.multimap;
-
-import java.io.IOException;
-import java.nio.LongBuffer;
-import java.nio.channels.FileChannel;
-
-public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
- private final long off;
- private final MultimapFileLongSlice map;
-
- public MultimapFileLongOffsetSlice(MultimapFileLongSlice map, long off) {
- this.off = off;
- this.map = map;
- }
-
- @Override
- public long size() {
- return map.size() - off;
- }
-
- @Override
- public void put(long idx, long val) {
- map.put(off+idx, val);
- }
-
- @Override
- public long get(long idx) {
- return map.get(off+idx);
- }
-
- @Override
- public void read(long[] vals, long idx) {
- map.read(vals, idx+off);
- }
-
- @Override
- public void read(long[] vals, int n, long idx) {
- map.read(vals, n, idx+off);
- }
-
- @Override
- public void write(long[] vals, long idx) {
- map.write(vals, idx+off);
- }
-
- @Override
- public void write(long[] vals, int n, long idx) {
- map.write(vals, n, idx+off);
- }
-
- @Override
- public void write(LongBuffer vals, long idx) {
- map.write(vals, idx+off);
- }
-
- @Override
- public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd)
- throws IOException {
- map.transferFromFileChannel(sourceChannel, destOffset + off, sourceStart, sourceEnd);
- }
-
- @Override
- public MultimapFileLongSlice atOffset(long off) {
- // If we don't override this, the default implementation would build a pyramid of
- // MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(...)))
- // if this is called iteratively (e.g. to walk over a file)
-
- return new MultimapFileLongOffsetSlice(map, this.off + off);
- }
-}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java
deleted file mode 100644
index abf29f51..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java
+++ /dev/null
@@ -1,29 +0,0 @@
-package nu.marginalia.util.multimap;
-
-import java.io.IOException;
-import java.nio.LongBuffer;
-import java.nio.channels.FileChannel;
-
-public interface MultimapFileLongSlice {
- long size();
-
- void put(long idx, long val);
-
- long get(long idx);
-
- void read(long[] vals, long idx);
-
- void read(long[] vals, int n, long idx);
-
- void write(long[] vals, long idx);
-
- void write(long[] vals, int n, long idx);
-
- void write(LongBuffer vals, long idx);
-
- void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException;
-
- default MultimapFileLongSlice atOffset(long off) {
- return new MultimapFileLongOffsetSlice(this, off);
- }
-}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java
index 005888d8..c961ac0e 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java
@@ -4,9 +4,9 @@ import lombok.experimental.Delegate;
public class MultimapSearcher {
@Delegate
- private final MultimapFileLongSlice mmf;
+ private final MultimapFileLong mmf;
- public MultimapSearcher(MultimapFileLongSlice mmf) {
+ public MultimapSearcher(MultimapFileLong mmf) {
this.mmf = mmf;
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java
index 61dd04c4..6ca4f64f 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java
@@ -13,10 +13,10 @@ import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE;
public class MultimapSorter {
private final Path tmpFileDir;
private final int internalSortLimit;
- private final MultimapFileLongSlice multimapFileLong;
+ private final MultimapFileLong multimapFileLong;
private final long[] buffer;
- public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit) {
+ public MultimapSorter(MultimapFileLong multimapFileLong, Path tmpFileDir, int internalSortLimit) {
this.multimapFileLong = multimapFileLong;
this.tmpFileDir = tmpFileDir;
this.internalSortLimit = internalSortLimit;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java
new file mode 100644
index 00000000..272a1798
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java
@@ -0,0 +1,49 @@
+package nu.marginalia.util.ranking;
+
+import com.zaxxer.hikari.HikariDataSource;
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
+import gnu.trove.map.hash.TIntIntHashMap;
+import it.unimi.dsi.fastutil.ints.IntArrays;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.sql.SQLException;
+
+public class AcademiaRank {
+ private final TIntArrayList result;
+ private static final Logger logger = LoggerFactory.getLogger(AcademiaRank.class);
+
+ public AcademiaRank(HikariDataSource ds, String... origins) throws IOException {
+
+ TIntList rankingResults = new BetterStandardPageRank(ds, origins).pageRank(100_000);
+ TIntIntHashMap idToRanking = new TIntIntHashMap(100_000, 0.5f, -1, 1_000_000_000);
+
+ for (int i = 0; i < rankingResults.size(); i++) {
+ idToRanking.put(rankingResults.get(i), i);
+ }
+
+ result = new TIntArrayList(10000);
+ try (var conn = ds.getConnection();
+ var stmt = conn.prepareStatement("select EC_DOMAIN.ID,COUNT(SOURCE_DOMAIN_ID) AS CNT from EC_DOMAIN INNER JOIN DOMAIN_METADATA ON DOMAIN_METADATA.ID=EC_DOMAIN.ID INNER JOIN EC_DOMAIN_LINK ON EC_DOMAIN_LINK.DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE INDEXED>0 AND STATE>=0 AND STATE<2 AND ((VISITED_URLS>1000+1500*RANK AND RANK<1) OR (GOOD_URLS>1000 AND URL_PART LIKE '%edu')) GROUP BY EC_DOMAIN.ID HAVING CNT<1500 ORDER BY RANK ASC")) {
+
+ stmt.setFetchSize(1000);
+ var rsp = stmt.executeQuery();
+ while (rsp.next()) {
+ result.add(rsp.getInt(1));
+ }
+ }
+ catch (SQLException ex) {
+ logger.error("SQL error", ex);
+ }
+
+ int[] internalArray = result.toArray();
+ IntArrays.quickSort(internalArray, (a,b) -> idToRanking.get(a) - idToRanking.get(b));
+ result.set(0, internalArray);
+ }
+
+ public TIntArrayList getResult() {
+ return result;
+ }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java
index 7d3b17c4..f2889ad6 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java
@@ -1,11 +1,15 @@
package nu.marginalia.util.ranking;
+import com.zaxxer.hikari.HikariDataSource;
+
+import java.io.IOException;
+
public class BetterReversePageRank extends RankingAlgorithm {
- public BetterReversePageRank(RankingDomainFetcher domains, String... origins) {
- super(domains, origins);
+ public BetterReversePageRank(HikariDataSource dataSource, String... origins) {
+ super(dataSource, origins);
}
@Override
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java
index f1f9b0b1..5b64fa73 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java
@@ -1,10 +1,14 @@
package nu.marginalia.util.ranking;
+import com.zaxxer.hikari.HikariDataSource;
+
+import java.io.IOException;
+
public class BetterStandardPageRank extends RankingAlgorithm {
- public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) {
- super(domains, origins);
+ public BetterStandardPageRank(HikariDataSource dataSource, String... origins) {
+ super(dataSource, origins);
}
@Override
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java
index 485ba353..1e87776c 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java
@@ -1,11 +1,15 @@
package nu.marginalia.util.ranking;
+import com.zaxxer.hikari.HikariDataSource;
+
+import java.io.IOException;
+
public class BuggyReversePageRank extends RankingAlgorithm {
- public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) {
- super(domains, origins);
+ public BuggyReversePageRank(HikariDataSource dataSource, String... origins) {
+ super(dataSource, origins);
}
@Override
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java
index 836bcdfe..a3d7b87e 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java
@@ -1,10 +1,14 @@
package nu.marginalia.util.ranking;
+import com.zaxxer.hikari.HikariDataSource;
+
+import java.io.IOException;
+
public class BuggyStandardPageRank extends RankingAlgorithm {
- public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) {
- super(domains, origins);
+ public BuggyStandardPageRank(HikariDataSource dataSource, String... origins) {
+ super(dataSource, origins);
}
@Override
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java
index 4d255087..fd76989c 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java
@@ -1,129 +1,224 @@
package nu.marginalia.util.ranking;
+import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TIntObjectHashMap;
+import gnu.trove.set.hash.TIntHashSet;
import it.unimi.dsi.fastutil.ints.IntComparator;
+import lombok.AllArgsConstructor;
+import lombok.Data;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
+import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
+import java.sql.SQLException;
import java.util.*;
import java.util.function.IntToDoubleFunction;
import java.util.stream.IntStream;
import it.unimi.dsi.fastutil.ints.IntArrays;
public abstract class RankingAlgorithm {
- protected final TIntObjectHashMap domainsById = new TIntObjectHashMap<>();
- protected final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
- protected final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
+ final TIntObjectHashMap domainsById = new TIntObjectHashMap<>();
+ final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
+ final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
- protected TIntArrayList[] linkDataSrc2Dest;
- protected TIntArrayList[] linkDataDest2Src;
+ private final TIntHashSet spamDomains;
+ private final HikariDataSource dataSource;
+
+ TIntArrayList[] linkDataSrc2Dest;
+ TIntArrayList[] linkDataDest2Src;
public final Set originDomains = new HashSet<>();
public final Set originDomainIds = new HashSet<>();
private int maxKnownUrls = Integer.MAX_VALUE;
+ private static final boolean getNames = true;
+
private final Logger logger = LoggerFactory.getLogger(getClass());
- private final RankingDomainFetcher domains;
+ public static void main(String... args) throws IOException {
+ var rpr = new BuggyReversePageRank(new DatabaseModule().provideConnection(), "wiki.xxiivv.com");
+ var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
- public RankingAlgorithm(RankingDomainFetcher domains, String... origins) {
- this.domains = domains;
-
- originDomains.addAll(Arrays.asList(origins));
-
- domains.getDomains(domainData -> {
- int id = domainData.id;
-
- domainsById.put(id, domainData);
-
- domainIndexToId.put(domainIndexToId.size(), id);
- domainIdToIndex.put(id, domainIdToIndex.size());
+ var rankVector = spr.pageRankVector();
+ var norm = rankVector.norm();
+ rpr.pageRank(i -> rankVector.get(i) / norm, 25).forEach(i -> {
+ System.out.println(spr.domainNameFromId(i));
+ return true;
});
-
- linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
- linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
-
- domains.eachDomainLink((src, dst) -> {
- if (src == dst) return;
-
- if (domainsById.contains(src) && domainsById.contains(dst)) {
-
- int srcIdx = domainIdToIndex.get(src);
- int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
-
- if (linkDataSrc2Dest[srcIdx] == null) {
- linkDataSrc2Dest[srcIdx] = new TIntArrayList();
- }
- linkDataSrc2Dest[srcIdx].add(dstIdx);
-
- if (linkDataDest2Src[dstIdx] == null) {
- linkDataDest2Src[dstIdx] = new TIntArrayList();
- }
- linkDataDest2Src[dstIdx].add(srcIdx);
- }
- });
-
- for (var namePattern : this.originDomains) {
- domains.domainsByPattern(namePattern, i -> {
- int ival = domainIdToIndex.get(i);
- if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
- originDomainIds.add(ival);
- }
- else {
- logger.debug("No value for {}", i);
- }
- });
- }
- logger.info("Origin Domains: {}", originDomainIds.size());
}
- public void addPeripheralNodes() {
+ public String domainNameFromId(int id) {
+ return domainsById.get(id).name;
+ }
+ public boolean isPeripheral(int id) {
+ return domainsById.get(id).peripheral;
+ }
+
+ public RankingAlgorithm(HikariDataSource dataSource, String... origins) {
+ this.dataSource = dataSource;
+ var blacklist = new EdgeDomainBlacklistImpl(dataSource);
+
+ spamDomains = blacklist.getSpamDomains();
+ originDomains.addAll(Arrays.asList(origins));
+
+ try (var conn = dataSource.getConnection()) {
+
+ String s;
+ if (getNames) {
+ s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
+ }
+ else {
+ s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
+ }
+ try (var stmt = conn.prepareStatement(s)) {
+ stmt.setFetchSize(10000);
+ var rsp = stmt.executeQuery();
+ while (rsp.next()) {
+ int id = rsp.getInt(1);
+ if (!spamDomains.contains(id)) {
+
+ domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), false));
+
+ domainIndexToId.put(domainIndexToId.size(), id);
+ domainIdToIndex.put(id, domainIdToIndex.size());
+ }
+ }
+ }
+
+
+ linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
+ linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
+
+ try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
+ stmt.setFetchSize(10000);
+
+ var rsp = stmt.executeQuery();
+
+ while (rsp.next()) {
+ int src = rsp.getInt(1);
+ int dst = rsp.getInt(2);
+
+ if (src == dst) continue;
+
+ if (domainsById.contains(src) && domainsById.contains(dst)) {
+
+ int srcIdx = domainIdToIndex.get(src);
+ int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
+
+ if (linkDataSrc2Dest[srcIdx] == null) {
+ linkDataSrc2Dest[srcIdx] = new TIntArrayList();
+ }
+ linkDataSrc2Dest[srcIdx].add(dstIdx);
+
+ if (linkDataDest2Src[dstIdx] == null) {
+ linkDataDest2Src[dstIdx] = new TIntArrayList();
+ }
+ linkDataDest2Src[dstIdx].add(srcIdx);
+ }
+ }
+ }
+
+ try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART LIKE ?")) {
+ for (var seed : this.originDomains) {
+ stmt.setString(1, seed);
+ var rsp = stmt.executeQuery();
+ while (rsp.next()) {
+ int i = rsp.getInt(1);
+ int ival = domainIdToIndex.get(i);
+ if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
+ originDomainIds.add(ival);
+ }
+ else {
+ logger.debug("No value for {}", i);
+ }
+ }
+ logger.debug("{} -> {}", seed, originDomainIds.size());
+ }
+ }
+
+ logger.info("Origin Domains: {}", originDomainIds.size());
+
+ } catch (SQLException throwables) {
+ logger.error("SQL error", throwables);
+ }
+ }
+
+ public void addPeripheralNodes(boolean includeErrorStates) {
int newNodesIdxCutoff = domainIdToIndex.size();
logger.info("Inserting peripheral nodes");
- domains.getPeripheralDomains(domainData -> {
- int id = domainData.id;
-
- if (domainsById.put(id, domainData) == null) { // true if id was not already present
- domainIndexToId.put(domainIndexToId.size(), id);
- domainIdToIndex.put(id, domainIdToIndex.size());
+ try (var conn = dataSource.getConnection()) {
+ String s;
+ if (getNames) {
+ s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
}
- });
-
- linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
- linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
-
- domains.eachDomainLink((src, dst) -> {
- if (src == dst) return;
-
- if (domainsById.contains(src) && domainsById.contains(dst)) {
- int srcIdx = domainIdToIndex.get(src);
- int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
-
- // This looks like a bug, but it improves the results
- if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
- return;
-
- if (linkDataSrc2Dest[srcIdx] == null) {
- linkDataSrc2Dest[srcIdx] = new TIntArrayList();
- }
- linkDataSrc2Dest[srcIdx].add(dstIdx);
-
- if (linkDataDest2Src[dstIdx] == null) {
- linkDataDest2Src[dstIdx] = new TIntArrayList();
- }
- linkDataDest2Src[dstIdx].add(srcIdx);
+ else {
+ s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
}
- });
+ try (var stmt = conn.prepareStatement(s)) {
+ stmt.setFetchSize(10000);
+ var rsp = stmt.executeQuery();
+
+ while (rsp.next()) {
+ int id = rsp.getInt(1);
+
+ if (!spamDomains.contains(id)) {
+ domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), true));
+
+ domainIndexToId.put(domainIndexToId.size(), id);
+ domainIdToIndex.put(id, domainIdToIndex.size());
+ }
+ }
+
+ }
+
+ linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
+ linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
+
+ try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
+ stmt.setFetchSize(10000);
+
+ var rsp = stmt.executeQuery();
+
+ while (rsp.next()) {
+ int src = rsp.getInt(1);
+ int dst = rsp.getInt(2);
+
+ if (src == dst) continue;
+
+ if (domainsById.contains(src) && domainsById.contains(dst)) {
+
+ int srcIdx = domainIdToIndex.get(src);
+ int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
+
+ // This looks like a bug, but it improves the results
+ if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
+ continue;
+
+ if (linkDataSrc2Dest[srcIdx] == null) {
+ linkDataSrc2Dest[srcIdx] = new TIntArrayList();
+ }
+ linkDataSrc2Dest[srcIdx].add(dstIdx);
+
+ if (linkDataDest2Src[dstIdx] == null) {
+ linkDataDest2Src[dstIdx] = new TIntArrayList();
+ }
+ linkDataDest2Src[dstIdx].add(srcIdx);
+ }
+ }
+ }
+ } catch (SQLException throwables) {
+ logger.error("SQL error", throwables);
+ }
logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size());
}
@@ -176,14 +271,14 @@ public abstract class RankingAlgorithm {
return rank.getRanking(resultCount);
}
- public TIntList pageRankWithPeripheralNodes(int resultCount) {
+ public TIntList pageRankWithPeripheralNodes(int resultCount, boolean includeErrorStates) {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
for (int i = 0; i < iter_max; i++) {
if (i == iter_max-1) {
- addPeripheralNodes();
+ addPeripheralNodes(includeErrorStates);
}
RankVector newRank = createNewRankVector(rank);
@@ -228,7 +323,7 @@ public abstract class RankingAlgorithm {
abstract RankVector createNewRankVector(RankVector rank);
- public boolean includeInRanking(RankingDomainData data) {
+ public boolean includeInRanking(DomainData data) {
if (data.isAlias())
return false;
if (data.isSpecial())
@@ -350,4 +445,32 @@ public abstract class RankingAlgorithm {
}
}
+ @Data
+ @AllArgsConstructor
+ static class DomainData {
+ public final int id;
+ public final String name;
+ private int alias;
+ private int state;
+ public final int knownUrls;
+ public boolean peripheral;
+
+ public int resolveAlias() {
+ if (alias == 0) return id;
+ return alias;
+ }
+
+ public boolean isAlias() {
+ return alias != 0;
+ }
+
+ public boolean isSpecial() {
+ return EdgeDomainIndexingState.SPECIAL.code == state;
+ }
+
+ public boolean isSocialMedia() {
+ return EdgeDomainIndexingState.SOCIAL_MEDIA.code == state;
+ }
+ }
+
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java
deleted file mode 100644
index c29ed704..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java
+++ /dev/null
@@ -1,33 +0,0 @@
-package nu.marginalia.util.ranking;
-
-import lombok.AllArgsConstructor;
-import lombok.Data;
-import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
-
-@Data
-@AllArgsConstructor
-class RankingDomainData {
- public final int id;
- public final String name;
- private int alias;
- private EdgeDomainIndexingState state;
- public final int knownUrls;
- public boolean peripheral;
-
- public int resolveAlias() {
- if (alias == 0) return id;
- return alias;
- }
-
- public boolean isAlias() {
- return alias != 0;
- }
-
- public boolean isSpecial() {
- return EdgeDomainIndexingState.SPECIAL == state;
- }
-
- public boolean isSocialMedia() {
- return EdgeDomainIndexingState.SOCIAL_MEDIA == state;
- }
-}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java
deleted file mode 100644
index 79285a83..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java
+++ /dev/null
@@ -1,105 +0,0 @@
-package nu.marginalia.util.ranking;
-
-import com.google.inject.Inject;
-import com.zaxxer.hikari.HikariDataSource;
-import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
-import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.sql.SQLException;
-import java.util.function.Consumer;
-import java.util.function.IntConsumer;
-
-public class RankingDomainFetcher {
- private final HikariDataSource dataSource;
- private final EdgeDomainBlacklistImpl blacklist;
- private final Logger logger = LoggerFactory.getLogger(getClass());
-
- private final boolean getNames = false;
-
- @Inject
- public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
- this.dataSource = dataSource;
- this.blacklist = blacklist;
- }
-
- public void getDomains(Consumer consumer) {
- String query;
- if (getNames) {
- query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
- }
- else {
- query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
- }
-
- getDomains(query, consumer);
- }
-
-
- public void getPeripheralDomains(Consumer consumer) {
- String query;
- if (getNames) {
- query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
- }
- else {
- query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
- }
-
- getDomains(query, consumer);
- }
-
- private void getDomains(String query, Consumer consumer) {
- try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
- stmt.setFetchSize(10000);
- var rsp = stmt.executeQuery();
- while (rsp.next()) {
- int id = rsp.getInt(1);
- if (!blacklist.isBlacklisted(id)) {
- consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false));
- }
- }
- }
- catch (SQLException ex) {
- logger.error("Failed to fetch domains", ex);
- }
- }
-
- public void eachDomainLink(DomainLinkConsumer consumer) {
- try (var conn = dataSource.getConnection();
- var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK"))
- {
- stmt.setFetchSize(10000);
-
- var rsp = stmt.executeQuery();
-
- while (rsp.next()) {
- int src = rsp.getInt(1);
- int dst = rsp.getInt(2);
-
- consumer.accept(src, dst);
- }
- }
- catch (SQLException ex) {
- logger.error("Failed to fetch domain links", ex);
- }
- }
-
- public void domainsByPattern(String pattern, IntConsumer idConsumer) {
- try (var conn = dataSource.getConnection();
- var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
- stmt.setString(1, pattern);
- var rsp = stmt.executeQuery();
- while (rsp.next()) {
- idConsumer.accept(rsp.getInt(1));
- }
- }
- catch (SQLException ex) {
- logger.error("Failed to fetch domains by pattern", ex);
- }
- }
-
- public interface DomainLinkConsumer {
- void accept(int from, int to);
- }
-}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java
index 02823563..6a214278 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java
@@ -66,7 +66,7 @@ public class OldReversePageRankV2 {
originDomains.add("memex.marginalia.nu");
try (var conn = dataSource.getConnection()) {
- try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE")) {
+ try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY_RAW>=-10")) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
@@ -90,7 +90,7 @@ public class OldReversePageRankV2 {
}
}
- try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+ try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
stmt.setFetchSize(10000);
for (var seed : this.originDomains) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java
index 74bef70a..c42b28dd 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java
@@ -48,7 +48,7 @@ public class StandardPageRank {
originDomains.addAll(Arrays.asList(origins));
try (var conn = dataSource.getConnection()) {
- try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,DOMAIN_NAME FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE AND QUALITY>=-10")) {
+ try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,URL_PART FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY>=-10")) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
@@ -78,7 +78,7 @@ public class StandardPageRank {
}
}
- try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+ try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
for (var seed : this.originDomains) {
stmt.setString(1, seed);
var rsp = stmt.executeQuery();
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java
index d6f95f51..a5ea8b06 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java
@@ -50,7 +50,7 @@ public class DedupTool {
Map>> domainToHashToUrl = new HashMap<>();
try (var conn = ds.getConnection();
- var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
+ var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.URL_PART FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?")
) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java
index 3f3ce6a5..85a691c2 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java
@@ -112,10 +112,10 @@ public class PerusePageRankV2 {
try (var conn = dataSource.getConnection()) {
String s;
if (getNames) {
- s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID";
+ s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID";
}
else {
- s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID";
+ s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID"; // same join as the getNames query; without it the ON clause is a syntax error
}
try (var stmt = conn.prepareStatement(s)) {
stmt.setFetchSize(10000);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java
new file mode 100644
index 00000000..38192b35
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java
@@ -0,0 +1,30 @@
+package nu.marginalia.util.ranking.tool;
+
+import lombok.SneakyThrows;
+import nu.marginalia.util.ranking.AcademiaRank;
+import nu.marginalia.wmsa.configuration.module.DatabaseModule;
+import org.mariadb.jdbc.Driver;
+
+import java.io.IOException;
+
+/** Manual tool: runs AcademiaRank from two seed domains and prints URL_PART for up to the first 100 result ids. */
+public class TestAcademiaRankTool {
+    @SneakyThrows
+    public static void main(String... args) {
+        Driver driver = new Driver(); // otherwise unused; presumably forces the MariaDB driver to load -- TODO confirm
+        try (var ds = new DatabaseModule().provideConnection()) { // one pool for ranking and lookup, closed on exit
+            var res = new AcademiaRank(ds, "www.perseus.tufts.edu", "xroads.virginia.edu").getResult();
+            int limit = Math.min(res.size(), 100); // loop-invariant, so computed once
+            try (var c = ds.getConnection(); var stmt = c.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) {
+                for (int i = 0; i < limit; i++) {
+                    stmt.setInt(1, res.getQuick(i));
+                    try (var rsp = stmt.executeQuery()) { // release each result set before the next lookup
+                        while (rsp.next()) {
+                            System.out.println(rsp.getString(1));
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java
index f80d307f..71ec72a6 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java
@@ -3,13 +3,12 @@ package nu.marginalia.util.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BuggyStandardPageRank;
-import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
@@ -44,14 +43,12 @@ public class UpdateDomainRanksTool {
var uploader = new Thread(() -> uploadThread(conn), "Uploader");
logger.info("Ranking");
- var ds = new DatabaseModule().provideConnection();
- var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
- var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
+ var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(),"memex.marginalia.nu");
rankMax = spr.size()*2;
uploader.start();
- spr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
+ spr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> {
try {
uploadQueue.put(i);
} catch (InterruptedException e) {
@@ -86,6 +83,11 @@ public class UpdateDomainRanksTool {
}
}
+ logger.info("Recalculating quality");
+ try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) {
+ stmt.executeUpdate();
+ }
+
} catch (SQLException | InterruptedException throwables) {
throwables.printStackTrace();
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java
index f46fb390..336b35fd 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java
@@ -3,13 +3,12 @@ package nu.marginalia.util.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BetterReversePageRank;
-import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
@@ -46,9 +45,7 @@ public class UpdateDomainRanksTool2 {
logger.info("Ranking");
// "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com",
// "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net"
- var ds = new DatabaseModule().provideConnection();
- var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
- var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
+ var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
// var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
// var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
@@ -61,7 +58,7 @@ public class UpdateDomainRanksTool2 {
rankMax = rpr.size();
- rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
+ rpr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> {
try {
uploadQueue.put(i);
} catch (InterruptedException e) {
@@ -97,6 +94,9 @@ public class UpdateDomainRanksTool2 {
}
logger.info("Recalculating quality");
+ try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) {
+ stmt.executeUpdate();
+ }
} catch (SQLException | InterruptedException throwables) {
throwables.printStackTrace();
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java
index 55648dfd..050152bc 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java
@@ -29,7 +29,7 @@ public class ReindexTriggerMain {
.build();
try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) {
- var rs = stmt.executeQuery("SELECT ID, DOMAIN_NAME, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
+ var rs = stmt.executeQuery("SELECT ID, URL_PART, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
while (rs.next()) {
System.out.printf("%d %s %s %d\n",
rs.getInt(1),
@@ -38,7 +38,7 @@ public class ReindexTriggerMain {
rs.getInt(4));
}
- rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, PATH, VISITED, STATE FROM EC_URL LIMIT 100");
+ rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, URL, VISITED, STATE FROM EC_URL LIMIT 100");
while (rs.next()) {
System.out.printf("%d %d %s %d %s\n",
rs.getInt(1),
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java
index c0698dde..8755716c 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java
@@ -14,7 +14,7 @@ public interface Interpreter {
void loadRssFeed(EdgeUrl[] rssFeed);
void loadDomainLink(DomainLink[] links);
- void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip);
+ void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality);
void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument);
void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java
index 2b1fd631..065d6211 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java
@@ -6,11 +6,11 @@ import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
-public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) implements Instruction {
+public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) implements Instruction {
@Override
public void apply(Interpreter interpreter) {
- interpreter.loadProcessedDomain(domain, state, ip);
+ interpreter.loadProcessedDomain(domain, state, quality);
}
@Override
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java
index 49a39457..140a762a 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java
@@ -76,9 +76,9 @@ public class Loader implements Interpreter {
}
@Override
- public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
- logger.debug("loadProcessedDomain({}, {}, {})", domain, state, ip);
- sqlLoadProcessedDomain.load(data, domain, state, ip);
+ public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) {
+ logger.debug("loadProcessedDomain({}, {}, {})", domain, state, quality);
+ sqlLoadProcessedDomain.load(data, domain, state, quality);
}
@Override
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java
index 6750bd33..e0978828 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java
@@ -30,7 +30,7 @@ public class SqlLoadDomainLinks {
INSERT IGNORE INTO EC_DOMAIN_LINK (SOURCE_DOMAIN_ID, DEST_DOMAIN_ID)
SELECT SOURCE.ID,DEST.ID
FROM EC_DOMAIN SOURCE INNER JOIN EC_DOMAIN DEST
- ON SOURCE.DOMAIN_NAME=FROM_DOMAIN AND DEST.DOMAIN_NAME=TO_DOMAIN;
+ ON SOURCE.URL_PART=FROM_DOMAIN AND DEST.URL_PART=TO_DOMAIN;
END
""");
}
@@ -61,8 +61,8 @@ public class SqlLoadDomainLinks {
}
}
}
- catch (SQLException ex) {
- logger.warn("SQL error inserting domain links", ex);
+ catch (SQLException sql) {
+ sql.printStackTrace();
}
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java
index 76a839c9..18cc40bd 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java
@@ -25,9 +25,15 @@ public class SqlLoadDomains {
stmt.execute("""
CREATE PROCEDURE INSERT_DOMAIN (
IN DOMAIN_NAME VARCHAR(255),
+ IN SUB_DOMAIN VARCHAR(255),
IN TOP_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci)
BEGIN
- INSERT IGNORE INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP) VALUES (DOMAIN_NAME, TOP_DOMAIN);
+ INSERT IGNORE INTO EC_TOP_DOMAIN (URL_PART) VALUES (TOP_DOMAIN);
+
+ INSERT IGNORE INTO EC_DOMAIN(URL_PART, URL_SUBDOMAIN, URL_TOP_DOMAIN_ID)
+ SELECT DOMAIN_NAME,SUB_DOMAIN,ID
+ FROM EC_TOP_DOMAIN
+ WHERE EC_TOP_DOMAIN.URL_PART=TOP_DOMAIN;
END
""");
}
@@ -40,9 +46,10 @@ public class SqlLoadDomains {
public void load(LoaderData data, EdgeDomain domain) {
try (var connection = dataSource.getConnection()) {
- try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
+ try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) {
insertCall.setString(1, domain.toString());
- insertCall.setString(2, domain.domain);
+ insertCall.setString(2, domain.subDomain);
+ insertCall.setString(3, domain.domain);
insertCall.addBatch();
var ret = insertCall.executeUpdate();
@@ -50,11 +57,12 @@ public class SqlLoadDomains {
logger.warn("load({}) -- bad row count {}", domain, ret);
}
+ connection.commit();
findIdForTargetDomain(connection, data);
}
}
catch (SQLException ex) {
- logger.warn("SQL error inserting domain", ex);
+ ex.printStackTrace();
}
@@ -65,11 +73,12 @@ public class SqlLoadDomains {
try (var connection = dataSource.getConnection()) {
connection.setAutoCommit(false);
- try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
+ try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) {
for (var domain : domains) {
insertCall.setString(1, domain.toString());
- insertCall.setString(2, domain.domain);
+ insertCall.setString(2, domain.subDomain);
+ insertCall.setString(3, domain.domain);
insertCall.addBatch();
}
var ret = insertCall.executeBatch();
@@ -86,7 +95,7 @@ public class SqlLoadDomains {
findIdForTargetDomain(connection, data);
}
catch (SQLException ex) {
- logger.warn("SQL error inserting domains", ex);
+ ex.printStackTrace();
}
}
@@ -95,7 +104,7 @@ public class SqlLoadDomains {
return;
}
- try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
+ try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?"))
{
var targetDomain = data.getTargetDomain();
@@ -109,7 +118,7 @@ public class SqlLoadDomains {
}
}
catch (SQLException ex) {
- logger.warn("SQL error finding id for domain", ex);
+ ex.printStackTrace();
}
}
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java
index e2e25fff..b25a657b 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java
@@ -31,14 +31,14 @@ public class SqlLoadProcessedDocument {
IN TITLE VARCHAR(255),
IN DESCRIPTION VARCHAR(255),
IN LENGTH INT,
+ IN QUALITY_MEASURE DOUBLE,
IN FEATURES INT,
IN STANDARD VARCHAR(32),
- IN QUALITY DOUBLE,
IN HASH INT)
BEGIN
SET FOREIGN_KEY_CHECKS=0;
- REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY);
- UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
+ REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES);
+ UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=QUALITY_MEASURE, DATA_HASH=HASH WHERE ID=URL_ID;
SET FOREIGN_KEY_CHECKS=1;
END
""");
@@ -47,8 +47,7 @@ public class SqlLoadProcessedDocument {
IN URL_ID INT,
IN STATE VARCHAR(32))
BEGIN
- UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
- DELETE FROM EC_PAGE_DATA WHERE ID=URL_ID;
+ UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=-100, DATA_HASH=NULL WHERE ID=URL_ID;
END
""");
@@ -62,7 +61,6 @@ public class SqlLoadProcessedDocument {
public void load(LoaderData data, List documents) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
- conn.setAutoCommit(false);
for (var doc : documents) {
int urlId = data.getUrlId(doc.url());
@@ -76,9 +74,9 @@ public class SqlLoadProcessedDocument {
stmt.setString(3, doc.title());
stmt.setString(4, doc.description());
stmt.setInt(5, doc.length());
- stmt.setInt(6, doc.htmlFeatures());
- stmt.setString(7, doc.standard().name());
- stmt.setDouble(8, doc.quality());
+ stmt.setDouble(6, doc.quality());
+ stmt.setInt(7, doc.htmlFeatures());
+ stmt.setString(8, doc.standard().name());
stmt.setInt(9, (int) doc.hash());
stmt.addBatch();
}
@@ -91,8 +89,8 @@ public class SqlLoadProcessedDocument {
}
conn.commit();
- } catch (SQLException ex) {
- logger.warn("SQL error inserting document", ex);
+ } catch (SQLException e) {
+ e.printStackTrace();
}
@@ -119,8 +117,8 @@ public class SqlLoadProcessedDocument {
logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]);
}
}
- } catch (SQLException ex) {
- logger.warn("SQL error inserting failed document", ex);
+ } catch (SQLException e) {
+ e.printStackTrace();
}
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java
index 018d76c9..64607b3a 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java
@@ -25,12 +25,12 @@ public class SqlLoadProcessedDomain {
stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN");
stmt.execute("""
CREATE PROCEDURE INITIALIZE_DOMAIN (
- IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'),
+ IN ST INT,
IN IDX INT,
- IN DID INT,
- IN IP VARCHAR(32))
+ IN QUAL DOUBLE,
+ IN DID INT)
BEGIN
- UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID;
+ UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), QUALITY=QUAL, QUALITY_RAW=QUAL, QUALITY_ORIGINAL=QUAL WHERE ID=DID;
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID;
END
""");
@@ -41,7 +41,7 @@ public class SqlLoadProcessedDomain {
}
}
- public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
+ public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, double quality) {
data.setTargetDomain(domain);
loadDomains.load(data, domain);
@@ -49,17 +49,18 @@ public class SqlLoadProcessedDomain {
try (var conn = dataSource.getConnection();
var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)"))
{
- initCall.setString(1, state.name());
+ initCall.setInt(1, state.code);
initCall.setInt(2, 1 + data.sizeHint / 100);
- initCall.setInt(3, data.getDomainId(domain));
- initCall.setString(4, ip);
+ initCall.setDouble(3, quality);
+ initCall.setInt(4, data.getDomainId(domain));
int rc = initCall.executeUpdate();
if (rc < 1) {
- logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc);
+ logger.warn("load({},{},{}) -- bad rowcount {}", domain, state, quality, rc);
}
+ conn.commit();
}
catch (SQLException ex) {
- logger.warn("SQL error initializing domain", ex);
+ ex.printStackTrace();
}
}
@@ -68,9 +69,9 @@ public class SqlLoadProcessedDomain {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
UPDATE EC_DOMAIN TARGET
- INNER JOIN EC_DOMAIN ALIAS ON ALIAS.DOMAIN_NAME=?
+ INNER JOIN EC_DOMAIN ALIAS ON ALIAS.URL_PART=?
SET TARGET.DOMAIN_ALIAS=ALIAS.ID
- WHERE TARGET.DOMAIN_NAME=?
+ WHERE TARGET.URL_PART=?
""")) {
stmt.setString(1, link.to().toString());
stmt.setString(2, link.from().toString());
@@ -80,7 +81,7 @@ public class SqlLoadProcessedDomain {
}
}
catch (SQLException ex) {
- logger.warn("SQL error inserting domain alias", ex);
+ ex.printStackTrace();
}
}
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java
index ba9ae43a..7d8851ca 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java
@@ -1,13 +1,11 @@
package nu.marginalia.wmsa.edge.converting.loader;
-import com.google.common.hash.Hashing;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.nio.charset.StandardCharsets;
import java.sql.SQLException;
import java.sql.Types;
@@ -27,13 +25,12 @@ public class SqlLoadUrls {
stmt.execute("""
CREATE PROCEDURE INSERT_URL (
IN PROTO VARCHAR(255),
- IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
+ IN DOMAIN_NAME VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
IN PORT INT,
- IN PATH VARCHAR(255),
- IN PATH_HASH BIGINT
+ IN URL VARCHAR(255)
)
BEGIN
- INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
+ INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,URL) SELECT PROTO,ID,PORT,URL FROM EC_DOMAIN WHERE URL_PART=DOMAIN_NAME;
END
""");
}
@@ -45,8 +42,8 @@ public class SqlLoadUrls {
public void load(LoaderData data, EdgeUrl[] urls) {
try (var conn = dataSource.getConnection();
- var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?, ?)");
- var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH FROM EC_URL WHERE DOMAIN_ID=?")
+ var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?)");
+ var queryCall = conn.prepareStatement("SELECT ID, PROTO, URL FROM EC_URL WHERE DOMAIN_ID=?")
)
{
conn.setAutoCommit(false);
@@ -61,7 +58,6 @@ public class SqlLoadUrls {
insertCall.setNull(3, Types.INTEGER);
}
insertCall.setString(4, url.path);
- insertCall.setLong(5, hashPath(url.path));
insertCall.addBatch();
}
var ret = insertCall.executeBatch();
@@ -90,11 +86,7 @@ public class SqlLoadUrls {
}
catch (SQLException ex) {
- logger.warn("SQL error inserting URLs", ex);
+ ex.printStackTrace();
}
}
-
- private long hashPath(String path) {
- return Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8).asLong();
- }
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java
index b75de436..d36cb830 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java
@@ -15,7 +15,7 @@ public class InstructionsCompiler {
public List compile(ProcessedDomain domain) {
List ret = new ArrayList<>(domain.size()*4);
- ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));
+ ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.averageQuality().orElse(-5.)));
if (domain.documents != null) {
compileUrls(ret, domain.documents);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java
index 52fe338a..2f25d6d7 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java
@@ -34,10 +34,11 @@ public class CrawlJobExtractorMain {
private static final String domainsSql =
"""
- SELECT ID, LOWER(EC_DOMAIN.DOMAIN_NAME)
+ SELECT ID, LOWER(EC_DOMAIN.URL_PART)
FROM EC_DOMAIN
- WHERE INDEXED>0
- AND STATE='ACTIVE' OR STATE='EXHAUSTED'
+ WHERE QUALITY_RAW>-100
+ AND INDEXED>0
+ AND STATE<2
ORDER BY
INDEX_DATE ASC,
DISCOVER_DATE ASC,
@@ -48,8 +49,8 @@ public class CrawlJobExtractorMain {
private static final String urlsSql =
"""
- SELECT URL
- FROM EC_URL_VIEW
+ SELECT CONCAT(PROTO, "://", ?, URL)
+ FROM EC_URL
WHERE DOMAIN_ID=?
ORDER BY
VISITED DESC,
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java
index ea1946fc..21935fd0 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java
@@ -6,7 +6,6 @@ import com.google.common.hash.Hashing;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.zaxxer.hikari.HikariDataSource;
-import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
@@ -31,19 +30,19 @@ public class CrawlJobExtractorPageRankMain {
"""
SELECT ID
FROM EC_DOMAIN
- WHERE DOMAIN_NAME=?
+ WHERE URL_PART=?
""";
private static final String specificDomainSqlFromId =
"""
- SELECT LOWER(DOMAIN_NAME)
+ SELECT LOWER(URL_PART)
FROM EC_DOMAIN
WHERE ID=?
""";
private static final String urlsSql =
"""
- SELECT URL
- FROM EC_URL_VIEW
+ SELECT CONCAT(PROTO, "://", ?, URL)
+ FROM EC_URL
WHERE DOMAIN_ID=?
ORDER BY
VISITED DESC,
@@ -74,12 +73,10 @@ public class CrawlJobExtractorPageRankMain {
Gson gson = new GsonBuilder().create();
- var ds = new DatabaseModule().provideConnection();
- var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
- var rpr = new BetterReversePageRank(domains, "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
+ var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
rpr.setMaxKnownUrls(750);
- var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size());
+ var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size(), false);
try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
final var extractor = new CrawlJobExtractorPageRankMain(new DatabaseModule().provideConnection());
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java
index 2f309b07..81e8dd58 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java
@@ -13,14 +13,44 @@ import java.util.Optional;
@ImplementedBy(EdgeDataStoreDaoImpl.class)
public interface EdgeDataStoreDao {
+ boolean isBlacklisted(EdgeDomain domain);
+
EdgeId getDomainId(EdgeDomain domain);
+ EdgeId getUrlId(EdgeUrl domain);
+ EdgeUrl getUrl(EdgeId id);
+ EdgeUrlDetails getUrlDetails(EdgeId id);
+ List getDomainNeighbors(EdgeId domainId, EdgeDomainBlacklist backlist, int count);
List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist backlist, int count);
-
List getRandomDomains(int count, EdgeDomainBlacklist backlist);
List getUrlDetailsMulti(List> ids);
+ List> getDomainIdsFromUrlIds(Collection> urlIds);
+
EdgeDomain getDomain(EdgeId id);
+ List> inboudUrls(EdgeId id, int limit);
+ List> outboundUrls(EdgeId id, int limit);
+ Optional> resolveAmbiguousDomain(String name);
+
+
+ int getPagesKnown(EdgeId domainId);
+ int getPagesVisited(EdgeId domainId);
+ int getPagesIndexed(EdgeId domainId);
+
+ int getIncomingLinks(EdgeId domainId);
+ int getOutboundLinks(EdgeId domainId);
+
+ double getDomainQuality(EdgeId domainId);
+
+ EdgeDomainIndexingState getDomainState(EdgeId domainId);
+
+ List getLinkingDomains(EdgeId domainId);
+
+ List getNewUrls(EdgeId domainId, Collection links);
+
+ double getRank(EdgeId domainId);
+
+ void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed);
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
index 30ea2256..a214bb15 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
@@ -33,6 +33,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
private final Cache> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build();
private final Cache> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+ private static final String DEFAULT_PROTOCOL = "http";
public static double QUALITY_LOWER_BOUND_CUTOFF = -15.;
@Inject
public EdgeDataStoreDaoImpl(HikariDataSource dataSource)
@@ -47,13 +48,30 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
domainIdCache.invalidateAll();
}
+ /**
+  * Checks whether EC_DOMAIN_BLACKLIST holds a row for the given domain.
+  *
+  * @param domain domain whose {@code domain} field is matched against URL_DOMAIN
+  * @return {@code true} when at least one matching row exists
+  */
+ @SneakyThrows
+ @Override
+ public boolean isBlacklisted(EdgeDomain domain) {
+     final String sql = "SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?";
+
+     try (var conn = dataSource.getConnection();
+          var query = conn.prepareStatement(sql)) {
+         query.setString(1, domain.domain);
+         // Only the presence of a row matters; the selected ID is never read.
+         return query.executeQuery().next();
+     }
+ }
+
@SneakyThrows
@Override
public EdgeId getDomainId(EdgeDomain domain) {
try (var connection = dataSource.getConnection()) {
return domainIdCache.get(domain, () -> {
- try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+ try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
stmt.setString(1, domain.toString());
var rsp = stmt.executeQuery();
if (rsp.next()) {
@@ -68,14 +86,104 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
}
}
- private String idList(List> ids) {
- StringJoiner j = new StringJoiner(",", "(", ")");
- for (var id : ids) {
- j.add(Integer.toString(id.getId()));
+ /**
+  * Looks up the id of a URL through {@code urlIdCache}.
+  * On a cache miss the loader first queries EC_URL_VIEW for a row matching
+  * path, domain and protocol, then retries matching path and domain only.
+  *
+  * @throws NoSuchElementException if neither query returns a row
+  */
+ @Override
+ @SneakyThrows
+ public EdgeId getUrlId(EdgeUrl url) {
+ // NOTE(review): the connection is obtained before the cache is consulted,
+ // so it is taken from the data source even when the id is already cached.
+ try (var connection = dataSource.getConnection()) {
+
+ return urlIdCache.get(url, () -> {
+ try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=? AND URL_PROTO=?")) {
+ stmt.setString(1, url.path);
+ stmt.setString(2, url.domain.toString());
+ stmt.setString(3, url.proto);
+
+ var rsp = stmt.executeQuery();
+ if (rsp.next()) {
+ return new EdgeId<>(rsp.getInt(1));
+ }
+ }
+ // Lenient mode for http->https upgrades etc
+ try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=?")) {
+ stmt.setString(1, url.path);
+ stmt.setString(2, url.domain.toString());
+
+ var rsp = stmt.executeQuery();
+ if (rsp.next()) {
+ return new EdgeId<>(rsp.getInt(1));
+ }
+ }
+ throw new NoSuchElementException(url.toString());
+ });
+ }
+ catch (UncheckedExecutionException ex) {
+ // Rethrow the wrapped cause instead of the cache's wrapper exception.
+ throw ex.getCause();
}
- return j.toString();
}
+
+ /**
+  * Fetches the DOMAIN_ID column of EC_URL for every URL id in {@code urlIds}.
+  * An empty input returns an empty list without touching the database.
+  * NOTE(review): rows are appended in result-set order, so the output is
+  * presumably not aligned index-for-index with the input — confirm callers do not rely on that.
+  */
+ @SneakyThrows
+ @Override
+ public List> getDomainIdsFromUrlIds(Collection> urlIds) {
+ List> results = new ArrayList<>(urlIds.size());
+
+ if (urlIds.isEmpty())
+ return results;
+
+ try (var connection = dataSource.getConnection()) {
+
+ // The IN list is built from EdgeId::getId values rendered with toString, not from free text.
+ try (var stmt = connection.prepareStatement("SELECT DOMAIN_ID FROM EC_URL WHERE ID IN " + urlIds
+ .stream()
+ .map(EdgeId::getId)
+ .map(Object::toString)
+ .collect(Collectors.joining(",", "(", ")"))))
+ {
+ var rsp = stmt.executeQuery();
+ while (rsp.next()) {
+ results.add(new EdgeId<>(rsp.getInt(1)));
+ }
+
+ }
+ }
+
+ return results;
+ }
+
+ /** Matches the characters {@code saneString} replaces: single quote, semicolon and backslash. */
+ static final Pattern badChars = Pattern.compile("[';\\\\]");
+
+ /**
+  * Replaces every single quote, semicolon and backslash in {@code s} with a
+  * question mark and wraps the result in single quotes.
+  * NOTE(review): this looks intended for embedding text in SQL; bind parameters
+  * would be the safer choice — confirm where it is used.
+  */
+ private String saneString(String s) {
+     String scrubbed = badChars.matcher(s).replaceAll("?");
+     return "'" + scrubbed + "'";
+ }
+ /**
+  * Loads protocol, domain, port and path for a URL id from EC_URL_VIEW.
+  *
+  * @param id id of the URL row to load
+  * @return the URL assembled from the first matching row
+  * @throws NoSuchElementException when no row has this id
+  */
+ @SneakyThrows
+ @Override
+ public EdgeUrl getUrl(EdgeId id) {
+     try (var conn = dataSource.getConnection();
+          var stmt = conn.createStatement()) {
+         // The id is an int, so appending it to the statement text is injection-safe.
+         var row = stmt.executeQuery("SELECT URL_PROTO, URL_DOMAIN,URL_PORT,URL_PATH FROM EC_URL_VIEW WHERE ID=" + id.getId());
+         if (!row.next()) {
+             throw new NoSuchElementException();
+         }
+
+         return new EdgeUrl(
+                 row.getString(1),                 // URL_PROTO
+                 new EdgeDomain(row.getString(2)), // URL_DOMAIN
+                 row.getInt(3),                    // URL_PORT
+                 row.getString(4));                // URL_PATH
+     }
+ }
+
+ /**
+  * Loads the detail record for one URL id from EC_URL_VIEW, left-joined with
+  * EC_DOMAIN_LINK_AGGREGATE for the link count (1 when LINKS is null).
+  *
+  * @param id id of the URL row to load
+  * @return details assembled from the first matching row
+  * @throws NoSuchElementException when no row has this id
+  */
+ @SneakyThrows
+ @Override
+ public EdgeUrlDetails getUrlDetails(EdgeId id) {
+     try (var conn = dataSource.getConnection();
+          var stmt = conn.createStatement()) {
+         // The id is an int, so appending it to the statement text is injection-safe.
+         var row = stmt.executeQuery("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID=" + id.getId());
+         if (!row.next()) {
+             throw new NoSuchElementException();
+         }
+
+         EdgeUrl url = new EdgeUrl(
+                 row.getString(2),                 // URL_PROTO
+                 new EdgeDomain(row.getString(3)), // URL_DOMAIN
+                 row.getInt(4),                    // URL_PORT
+                 row.getString(5));                // URL_PATH
+
+         // Arguments keep the constructor's positional order; the trailing
+         // comments name the result-set column each value is read from.
+         return new EdgeUrlDetails(
+                 row.getInt(1),                  // ID
+                 url,
+                 row.getString(6),               // TITLE
+                 row.getString(7),               // DESCRIPTION
+                 row.getDouble(8),               // URL_QUALITY_MEASURE
+                 row.getDouble(15),              // QUALITY_RAW
+                 row.getDouble(9),               // DOMAIN_QUALITY_MEASURE
+                 row.getInt(10),                 // IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1)
+                 row.getInt(11),                 // WORDS_TOTAL
+                 row.getString(12),              // FORMAT
+                 row.getInt(13),                 // FEATURES
+                 EdgePageScoreAdjustment.zero(),
+                 Integer.MAX_VALUE,
+                 Double.MAX_VALUE,
+                 row.getString(14),              // the "" literal column
+                 row.getInt(16),                 // DOMAIN_STATE
+                 0,
+                 row.getInt(17));                // DATA_HASH
+     }
+ }
+
+
@SneakyThrows
@Override
public List getUrlDetailsMulti(List> ids) {
@@ -85,39 +193,16 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
List result = new ArrayList<>(ids.size());
try (var connection = dataSource.getConnection()) {
+ // This is SQL-injection safe, the IDs are of type int
+ String idString = ids.stream().map(EdgeId::getId).map(Objects::toString).collect(Collectors.joining(",", "(", ")"));
- String idString = idList(ids);
-
- try (var stmt = connection.prepareStatement(
- """
- SELECT ID, URL,
- TITLE, DESCRIPTION,
- QUALITY,
- WORDS_TOTAL, FORMAT, FEATURES,
- IP, DOMAIN_STATE,
- DATA_HASH
- FROM EC_URL_VIEW WHERE ID IN
- """ + idString)) {
+ try (var stmt = connection.prepareStatement("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) {
stmt.setFetchSize(ids.size());
var rsp = stmt.executeQuery();
while (rsp.next()) {
- EdgeUrl url = new EdgeUrl(rsp.getString(2));
- var val = new EdgeUrlDetails(rsp.getInt(1), url,
- rsp.getString(3), // title
- rsp.getString(4), // description
- rsp.getDouble(5), // quality
- rsp.getInt(6), // wordsTotal
- rsp.getString(7), // format
- rsp.getInt(8), // features
- rsp.getString(9), // ip
- EdgeDomainIndexingState.valueOf(rsp.getString(10)), // domainState
- rsp.getInt(11), // dataHash
- EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
- Integer.MAX_VALUE, // rankingId
- Double.MAX_VALUE, // termScore
- 0 // queryLength
- );
+ EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5));
+ var val = new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17));
if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) {
result.add(val);
}
@@ -129,13 +214,82 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
return result;
}
+ /**
+  * Gathers browse results for domains related to {@code domainId} from three
+  * queries: precomputed neighbors (EC_DOMAIN_NEIGHBORS), domains this domain
+  * links to, and domains that link to this domain.
+  *
+  * Each query is limited to {@code count} rows and results are merged through
+  * a HashSet, so up to {@code 3*count} entries may be returned and the
+  * per-query ORDER BY is not preserved in the returned list.
+  *
+  * @param domainId  domain whose neighbors are wanted
+  * @param blacklist domains whose ids it reports as blacklisted are left out
+  * @param count     row limit applied to each individual query
+  * @return the merged results; whatever was collected so far if a query fails
+  */
+ @Override
+ public List getDomainNeighbors(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) {
+     final Set<BrowseResult> domains = new HashSet<>(count*3);
+
+     final String neighborsQuery = "SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART from EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL AND EC_DOMAIN_NEIGHBORS.DOMAIN_ID = ? ORDER BY ADJ_IDX LIMIT ?";
+     final String outboundQuery = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE SOURCE_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?";
+     // Joins on SOURCE_DOMAIN_ID: joining on DEST_DOMAIN_ID while also filtering
+     // DEST_DOMAIN_ID=? could only ever return the queried domain itself.
+     final String inboundQuery = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE DEST_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?";
+
+     try (var connection = dataSource.getConnection()) {
+         for (String query : List.of(neighborsQuery, outboundQuery, inboundQuery)) {
+             collectBrowseResults(connection, query, domainId.getId(), count, blacklist, domains);
+         }
+     } catch (SQLException ex) {
+         // Best effort: keep what was collected, but report through the class logger.
+         logger.error("DB error", ex);
+     }
+
+     return new ArrayList<>(domains);
+ }
+
+ /**
+  * Runs one (domainId, limit) neighbor query and adds every non-blacklisted
+  * row to {@code sink} as a root-path BrowseResult.
+  */
+ private void collectBrowseResults(java.sql.Connection connection,
+                                   String query,
+                                   int domainId,
+                                   int count,
+                                   EdgeDomainBlacklist blacklist,
+                                   Set<BrowseResult> sink) throws SQLException {
+     try (var stmt = connection.prepareStatement(query)) {
+         stmt.setFetchSize(count);
+         stmt.setInt(1, domainId);
+         stmt.setInt(2, count);
+
+         try (var rsp = stmt.executeQuery()) {
+             while (rsp.next()) {
+                 int id = rsp.getInt(1);
+                 String domain = rsp.getString(2);
+
+                 if (!blacklist.isBlacklisted(id)) {
+                     var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
+                     sink.add(new BrowseResult(url, id));
+                 }
+             }
+         }
+     }
+ }
+
@Override
public List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) {
final Set domains = new HashSet<>(count*3);
final String q = """
- SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT
+ SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART, COUNT(*) AS CNT
FROM EC_DOMAIN_NEIGHBORS
INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
@@ -162,14 +316,16 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
- domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
+ var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
+
+ domains.add(new BrowseResult(url, id));
}
}
}
if (domains.size() < count/2) {
final String q2 = """
- SELECT EC_DOMAIN.ID, DOMAIN_NAME
+ SELECT EC_DOMAIN.ID, URL_PART
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID
@@ -191,7 +347,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
- domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
+ var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
+
+ domains.add(new BrowseResult(url, id));
}
}
}
@@ -199,11 +357,11 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
if (domains.size() < count/2) {
final String q3 = """
- SELECT EC_DOMAIN.ID, DOMAIN_NAME
- FROM EC_DOMAIN
- INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
+ SELECT EC_DOMAIN.ID, URL_PART
+ FROM EC_DOMAIN
+ INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID
- INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID
+ INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID
WHERE B.DEST_DOMAIN_ID=?
AND STATE<2
AND KNOWN_URLS<1000
@@ -223,7 +381,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
- domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
+ var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
+
+ domains.add(new BrowseResult(url, id));
}
}
}
@@ -239,15 +399,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
@Override
public List getRandomDomains(int count, EdgeDomainBlacklist blacklist) {
- final String q = """
- SELECT DOMAIN_ID, DOMAIN_NAME
- FROM EC_RANDOM_DOMAINS
- INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
- WHERE STATE<2
- AND DOMAIN_ALIAS IS NULL
- ORDER BY RAND()
- LIMIT ?
- """;
+ final String q = "SELECT DOMAIN_ID,URL_PART FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?";
List domains = new ArrayList<>(count);
try (var conn = dataSource.getConnection()) {
try (var stmt = conn.prepareStatement(q)) {
@@ -258,7 +410,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
- domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
+ var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
+
+ domains.add(new BrowseResult(url, id));
}
}
}
@@ -274,7 +428,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
public EdgeDomain getDomain(EdgeId id) {
try (var connection = dataSource.getConnection()) {
- try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
+ try (var stmt = connection.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, id.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
@@ -285,4 +439,330 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
}
}
+ /**
+  * Returns up to {@code limit} SRC_URL_ID values from EC_RELATED_LINKS_IN whose
+  * DEST_URL_ID equals {@code id}, ordered by SRC_URL_QUALITY descending.
+  */
+ @Override @SneakyThrows
+ public List> inboudUrls(EdgeId id, int limit) {
+
+ List> ret = new ArrayList<>();
+ try (var connection = dataSource.getConnection()) {
+
+ try (var stmt =
+ connection.prepareStatement("SELECT SRC_URL_ID FROM EC_RELATED_LINKS_IN WHERE DEST_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) {
+ stmt.setFetchSize(limit);
+ stmt.setInt(1, id.getId());
+ stmt.setInt(2, limit);
+ var rsp = stmt.executeQuery();
+ while (rsp.next()) {
+ ret.add(new EdgeId<>(rsp.getInt(1)));
+ }
+ }
+
+ }
+
+ return ret;
+ }
+
+
+ /**
+  * Returns up to {@code limit} DEST_URL_ID values from EC_RELATED_LINKS_IN whose
+  * SRC_URL_ID equals {@code id}.
+  * NOTE(review): the rows are ordered by SRC_URL_QUALITY although every row shares
+  * the same source URL; presumably a destination-side ordering was intended — confirm against the schema.
+  */
+ @Override @SneakyThrows
+ public List> outboundUrls(EdgeId id, int limit) {
+
+ List> ret = new ArrayList<>();
+ try (var connection = dataSource.getConnection()) {
+
+ try (var stmt =
+ connection.prepareStatement("SELECT DEST_URL_ID FROM EC_RELATED_LINKS_IN WHERE SRC_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) {
+ stmt.setFetchSize(limit);
+ stmt.setInt(1, id.getId());
+ stmt.setInt(2, limit);
+ var rsp = stmt.executeQuery();
+ while (rsp.next()) {
+ ret.add(new EdgeId<>(rsp.getInt(1)));
+ }
+ }
+
+ }
+
+ return ret;
+ }
+
+ /**
+  * Resolves a loosely specified domain name to a domain id by trying, in
+  * order, the name as given and then the name prefixed with "https://",
+  * "http://", "https://www." and "http://www." against EC_DOMAIN.URL_PART.
+  * The first hit wins; a row's DOMAIN_ALIAS is returned in place of its ID
+  * when the alias is set.
+  *
+  * @param name domain name to resolve
+  * @return the resolved id, or empty when nothing matches or the lookup fails
+  */
+ @Override
+ public Optional> resolveAmbiguousDomain(String name) {
+     final String[] candidates = {
+             name,
+             "https://"+name,
+             "http://"+name,
+             "https://www."+name,
+             "http://www."+name
+     };
+
+     // One prepared statement is reused for every candidate instead of
+     // preparing the same SQL five times.
+     try (var connection = dataSource.getConnection();
+          var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
+         for (String candidate : candidates) {
+             stmt.setString(1, candidate);
+             try (var rsp = stmt.executeQuery()) {
+                 if (rsp.next()) {
+                     return Optional.of(new EdgeId<>(rsp.getInt(1)));
+                 }
+             }
+         }
+     } catch (SQLException ex) {
+         // Still best effort, but keep the cause so the failure can be diagnosed.
+         logger.info("Could not resolve domain id for {}", name, ex);
+     }
+
+     return Optional.empty();
+ }
+
+ /**
+  * Reads KNOWN_URLS from DOMAIN_METADATA for the given domain id.
+  *
+  * @return the stored value, or 0 when no row exists or the query fails
+  *         (the failure is logged); connection errors propagate
+  */
+ @SneakyThrows
+ @Override
+ public int getPagesKnown(EdgeId domainId) {
+     try (var conn = dataSource.getConnection()) {
+         try (var query = conn.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
+             query.setInt(1, domainId.getId());
+             var rows = query.executeQuery();
+             return rows.next() ? rows.getInt(1) : 0;
+         } catch (Exception ex) {
+             logger.error("DB error", ex);
+         }
+         // Reached only after a logged failure.
+         return 0;
+     }
+ }
+
+ /**
+  * Reads VISITED_URLS from DOMAIN_METADATA for the given domain id.
+  *
+  * @return the stored value, or 0 when no row exists or the query fails
+  *         (the failure is logged); connection errors propagate
+  */
+ @SneakyThrows
+ @Override
+ public int getPagesVisited(EdgeId domainId) {
+     try (var conn = dataSource.getConnection()) {
+         try (var query = conn.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
+             query.setInt(1, domainId.getId());
+             var rows = query.executeQuery();
+             return rows.next() ? rows.getInt(1) : 0;
+         } catch (Exception ex) {
+             logger.error("DB error", ex);
+         }
+         // Reached only after a logged failure.
+         return 0;
+     }
+ }
+
+
+ /**
+  * Reads GOOD_URLS from DOMAIN_METADATA for the given domain id.
+  *
+  * @return the stored value, or 0 when no row exists or the query fails
+  *         (the failure is logged); connection errors propagate
+  */
+ @SneakyThrows
+ @Override
+ public int getPagesIndexed(EdgeId domainId) {
+     try (var conn = dataSource.getConnection()) {
+         try (var query = conn.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
+             query.setInt(1, domainId.getId());
+             var rows = query.executeQuery();
+             return rows.next() ? rows.getInt(1) : 0;
+         } catch (Exception ex) {
+             logger.error("DB error", ex);
+         }
+         // Reached only after a logged failure.
+         return 0;
+     }
+ }
+
+ /**
+  * Counts the EC_DOMAIN_LINK rows whose DEST_DOMAIN_ID is the given domain id.
+  *
+  * @return the count, or 0 when the query yields no row or fails
+  *         (the failure is logged); connection errors propagate
+  */
+ @SneakyThrows
+ @Override
+ public int getIncomingLinks(EdgeId domainId) {
+     try (var conn = dataSource.getConnection()) {
+         try (var query = conn.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) {
+             query.setInt(1, domainId.getId());
+             var rows = query.executeQuery();
+             return rows.next() ? rows.getInt(1) : 0;
+         } catch (Exception ex) {
+             logger.error("DB error", ex);
+         }
+         // Reached only after a logged failure.
+         return 0;
+     }
+ }
+ /**
+  * Counts the EC_DOMAIN_LINK rows whose SOURCE_DOMAIN_ID is the given domain id.
+  *
+  * @return the count, or 0 when the query yields no row or fails
+  *         (the failure is logged); connection errors propagate
+  */
+ @SneakyThrows
+ @Override
+ public int getOutboundLinks(EdgeId domainId) {
+     try (var conn = dataSource.getConnection()) {
+         try (var query = conn.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) {
+             query.setInt(1, domainId.getId());
+             var rows = query.executeQuery();
+             return rows.next() ? rows.getInt(1) : 0;
+         } catch (Exception ex) {
+             logger.error("DB error", ex);
+         }
+         // Reached only after a logged failure.
+         return 0;
+     }
+ }
+
+ /**
+  * Reads the QUALITY column of EC_DOMAIN for the given domain id.
+  *
+  * @return the stored value, or -5 when no row exists or the query fails
+  *         (the failure is logged); connection errors propagate
+  */
+ @SneakyThrows
+ @Override
+ public double getDomainQuality(EdgeId domainId) {
+     try (var conn = dataSource.getConnection()) {
+         try (var query = conn.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) {
+             query.setInt(1, domainId.getId());
+             var rows = query.executeQuery();
+             return rows.next() ? rows.getDouble(1) : -5;
+         } catch (Exception ex) {
+             logger.error("DB error", ex);
+         }
+         // Reached only after a logged failure.
+         return -5;
+     }
+ }
+
+ @Override
+ public EdgeDomainIndexingState getDomainState(EdgeId