From 730e964475a373051764237b4120a1d56fbf29b4 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 30 May 2022 21:00:43 +0200 Subject: [PATCH 01/27] Tweaked madvise for index to be faster --- .../wmsa/edge/index/service/index/SearchIndex.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java index c25100f4..222c332e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java @@ -3,15 +3,18 @@ package nu.marginalia.wmsa.edge.index.service.index; import com.google.inject.Inject; import com.google.inject.name.Named; import com.upserve.uppend.blobs.NativeIO; +import io.reactivex.rxjava3.schedulers.Schedulers; import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable; import nu.marginalia.util.btree.BTreeReader; import nu.marginalia.util.multimap.MultimapFileLong; +import org.eclipse.jetty.util.thread.ThreadPool; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; +import java.util.concurrent.ForkJoinPool; import java.util.stream.LongStream; public class SearchIndex implements AutoCloseable { @@ -40,12 +43,12 @@ public class SearchIndex implements AutoCloseable { bTreeReader = new BTreeReader(urls, SearchIndexConverter.urlsBTreeContext); - madvise(urls, bTreeReader); + Schedulers.io().scheduleDirect(() -> madvise(urls, bTreeReader)); } private void madvise(MultimapFileLong urls, BTreeReader reader) { - urls.advice(NativeIO.Advice.Sequential); + urls.advice(NativeIO.Advice.Random); words.forEachWordsOffset(offset -> { var h = reader.getHeader(offset); int length = (int) (h.dataOffsetLongs() - h.indexOffsetLongs()); From 
dc963d3e4476029b105dbdb4b6533bda1689a821 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 30 May 2022 20:59:04 +0200 Subject: [PATCH 02/27] Added instrumentation for search queries --- .../wmsa/edge/search/EdgeSearchOperator.java | 58 ++++++++----------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java index dd37c515..10675cc5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java @@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.search; import com.google.inject.Inject; import com.google.inject.Singleton; +import io.prometheus.client.Summary; import io.reactivex.rxjava3.core.Observable; import io.reactivex.rxjava3.schedulers.Schedulers; import nu.marginalia.wmsa.configuration.server.Context; @@ -47,6 +48,9 @@ public class EdgeSearchOperator { private final SearchResultValuator valuator; private final Comparator resultListComparator; + private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register(); + private static final Summary wmsa_search_result_decoration_time = Summary.build().name("wmsa_search_result_decoration_time").help("-").register(); + @Inject public EdgeSearchOperator(AssistantClient assistantClient, EncyclopediaClient encyclopediaClient, @@ -141,27 +145,30 @@ public class EdgeSearchOperator { AccumulatedQueryResults queryResults = new AccumulatedQueryResults(); UrlDeduplicator deduplicator = new UrlDeduplicator(processedQuery.specs.limitByDomain); - - if (processedQuery.searchTermsHuman.size()<=4 && !asFastAsPossible) { - fetchResultsMulti(ctx, processedQuery, queryResults, deduplicator); - } - else { - fetchResultsSimple(ctx, processedQuery, queryResults, deduplicator); - } - List 
resultList = new ArrayList<>(queryResults.size()); - for (var details : queryResults.results) { - if (details.getUrlQuality() < -100) { - continue; + wmsa_search_index_api_time.time(() -> { + if (processedQuery.searchTermsHuman.size() <= 4 && !asFastAsPossible) { + fetchResultsMulti(ctx, processedQuery, queryResults, deduplicator); + } else { + fetchResultsSimple(ctx, processedQuery, queryResults, deduplicator); + } + }); + + wmsa_search_result_decoration_time.time(() -> { + for (var details : queryResults.results) { + if (details.getUrlQuality() < -100) { + continue; + } + var scoreAdjustment = adjustScoreBasedOnQuery(details, processedQuery.specs); + details = details.withUrlQualityAdjustment(scoreAdjustment); + + resultList.add(details); } - var scoreAdjustment = adjustScoreBasedOnQuery(details, processedQuery.specs); - details = details.withUrlQualityAdjustment(scoreAdjustment); - resultList.add(details); + resultList.sort(resultListComparator); } - - resultList.sort(resultListComparator); + ); return new DecoratedSearchResultSet(resultList); } @@ -254,31 +261,14 @@ public class EdgeSearchOperator { var blocksOrder = processedQuery.specs.subqueries.stream().map(sq -> sq.block).distinct().sorted(Comparator.comparing(block -> block.sortOrder)).toList(); + EdgeSearchSpecification[] specsArray = processedQuery.specs.subqueries.stream() .filter(sq -> sq.block == IndexBlock.TitleKeywords) .map(sq -> processedQuery.specs.withSubqueries(blocksOrder.stream().map(sq::withBlock).collect(Collectors.toList()))) - //.flatMap(specs -> processedQuery.specs.buckets.stream().map(bucket -> specs.withBuckets(List.of(bucket)))) .toArray(EdgeSearchSpecification[]::new); var resultSets = indexClient.multiQuery(ctx, specsArray); - if (debug) { - for (var s : specsArray) { - logger.info("{}", s); - } - for (IndexBlock block : indexBlockSearchOrder) { - resultSets.forEach(res -> { - res.resultsList.getOrDefault(block, Collections.emptyList()).forEach(b2 -> { - 
b2.results.forEach((idx,items) -> { - items.forEach(i -> - logger.info("{} {} - {}", block, idx, i) - ); - }); - }); - }); - } - } - Set> seenUrls = new HashSet<>(); for (IndexBlock block : indexBlockSearchOrder) { var resultsJoined = resultSets.stream().flatMap(rs -> rs.resultsList.getOrDefault(block, Collections.emptyList()).stream()) From 689412185923a98cdeada29453ff5aea1a54c5cd Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 30 May 2022 23:19:55 +0200 Subject: [PATCH 03/27] Tweaked madvise for index to be faster --- .../marginalia/wmsa/edge/index/service/index/SearchIndex.java | 3 +++ .../java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java index 222c332e..17e62437 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java @@ -53,8 +53,11 @@ public class SearchIndex implements AutoCloseable { var h = reader.getHeader(offset); int length = (int) (h.dataOffsetLongs() - h.indexOffsetLongs()); + urls.adviceRange(NativeIO.Advice.Normal, offset, 512); + if (length > 0) { urls.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length); + urls.adviceRange(NativeIO.Advice.Normal, h.dataOffsetLongs(), 2048); urls.pokeRange(h.indexOffsetLongs(), length); } }); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java index a486e63d..329322a2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java @@ -8,7 +8,6 @@ import 
com.google.inject.name.Named; import lombok.SneakyThrows; import nu.marginalia.wmsa.api.model.ApiSearchResult; import nu.marginalia.wmsa.api.model.ApiSearchResults; -import nu.marginalia.wmsa.client.exception.TimeoutException; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; From fc070f2e0e5d6109c1175026659fd44c8407d90c Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 30 May 2022 23:41:13 +0200 Subject: [PATCH 04/27] Fixed memory alignment for MMFL --- .../java/nu/marginalia/util/multimap/MultimapFileLong.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java index 5f561485..dca8248e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java @@ -52,14 +52,15 @@ public class MultimapFileLong implements AutoCloseable { } private static int getBufferSize(long totalSize, boolean write) { + int defaultBig = 2<<23; if (totalSize > Integer.MAX_VALUE/WORD_SIZE) { - return (int)(Integer.MAX_VALUE/WORD_SIZE); + return defaultBig; } else if (write && totalSize < 8*1024*1024) { return 8*1024*1024; } else { - return (int) Math.min(totalSize, Integer.MAX_VALUE/WORD_SIZE); + return (int) Math.min(totalSize, defaultBig); } } From ec87c0689fa22018fceca36531a7a922f499bde5 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Tue, 31 May 2022 13:37:20 +0200 Subject: [PATCH 05/27] Added timeout to queries --- .../wmsa/edge/index/EdgeIndexService.java | 14 +++-------- .../service/index/SearchIndexReader.java | 2 ++ .../service/query/IndexQueryBuilder.java | 25 +++++++++++-------- .../service/query/IndexSearchBudget.java | 16 ++++++------ 
.../index/service/SearchIndexWriterTest.java | 2 +- 5 files changed, 28 insertions(+), 31 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index 3fa3625b..81d57139 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -41,7 +41,7 @@ import static spark.Spark.get; import static spark.Spark.halt; public class EdgeIndexService extends Service { - private static final int SEARCH_BUDGET_LIMIT = 1_000_000; + private static final int SEARCH_BUDGET_TIMEOUT_MS = 100; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -243,7 +243,7 @@ public class EdgeIndexService extends Service { new DomainResultCountFilter(specsSet.limitByDomain) }; - final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_LIMIT); + final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS); final TIntIntHashMap limitsPerBucketRemaining = new TIntIntHashMap(6, 0.7f, 0, specsSet.limitByBucket); for (int i = 0; i < specsSet.buckets.size(); i+=2) { @@ -279,10 +279,6 @@ public class EdgeIndexService extends Service { } } - if (budget.used() > 0) { - logger.debug("Query used ${}", budget.used()); - } - return results; } @@ -294,7 +290,7 @@ public class EdgeIndexService extends Service { final DomainResultCountFilter domainCountFilter = new DomainResultCountFilter(specsSet.limitByDomain); - IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_LIMIT); + IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS); for (var sq : specsSet.subqueries) { Optional searchTerms = getSearchTerms(sq); @@ -316,10 +312,6 @@ public class EdgeIndexService extends Service { } } - if (budget.used() > 0) { - logger.debug("Query used ${}", budget.used()); - } - return results; } diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java index df269034..7baeb8ae 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java @@ -74,7 +74,9 @@ public class SearchIndexReader implements AutoCloseable { IndexSearchBudget budget, LongPredicate filter, int wordId) { + var builder = underspecifiedQueryBuilders.get(block); + if (null != builder) { return builder.buildUnderspecified(budget, filter, wordId); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java index de3f1435..be217057 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java @@ -32,7 +32,9 @@ public class IndexQueryBuilder { return new QueryForIndices(budget, filter, wordId); } + // Special treatment for queries with few terms, prefer hits that appear in multiple buckets public Query buildUnderspecified(IndexSearchBudget budget, LongPredicate filter, int wordId) { + if (requiredIndices.size() == 1) { return build(budget, filter, wordId); } @@ -51,7 +53,7 @@ public class IndexQueryBuilder { return new QueryForIndices(budget, () -> Streams.concat(IntStream.range(1, relevantIndices.length) - .mapToObj(i -> underspecifiedPairStream(budget, (int) budget.limit()/(relevantIndices.length*2), relevantIndices[0], relevantIndices[i], wordId)) + .mapToObj(i -> underspecifiedPairStream(budget, 1000, relevantIndices[0], relevantIndices[i], wordId)) .flatMapToLong(Function.identity()), 
fstRange.stream().takeWhile(budget::take)) .filter(filter) @@ -59,17 +61,20 @@ public class IndexQueryBuilder { } private LongStream underspecifiedPairStream(IndexSearchBudget budget, int limit, int firstIdx, int otherIdx, int wordId) { - SearchIndex first = requiredIndices.get(firstIdx), - second = requiredIndices.get(otherIdx); + SearchIndex firstTmp = requiredIndices.get(firstIdx), + secondTmp = requiredIndices.get(otherIdx); - if (first.numUrls(wordId) > second.numUrls(wordId)) { - SearchIndex tmp = first; - first = second; - second = tmp; + final SearchIndex fst; + final SearchIndex snd; + + if (firstTmp.numUrls(wordId) > secondTmp.numUrls(wordId)) { + fst = secondTmp; + snd = firstTmp; + } + else { + fst = firstTmp; + snd = secondTmp; } - - SearchIndex fst = first; - SearchIndex snd = second; var sndRange = snd.rangeForWord(wordId); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java index e84af8fe..2ec30e65 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java @@ -1,18 +1,16 @@ package nu.marginalia.wmsa.edge.index.service.query; -import lombok.RequiredArgsConstructor; -@RequiredArgsConstructor public class IndexSearchBudget { - private final long limit; - private long used = 0; + private long timeout; + public IndexSearchBudget(long limitTime) { + this.timeout = System.currentTimeMillis() + limitTime; + } + + // Used for short-circuiting Stream-objects using takeWhile, we don't care public boolean take(long unused) { - return used++ < limit; + return System.currentTimeMillis() < timeout; } - public long used() { - return used; - } - public long limit() { return limit; } } diff --git 
a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java index f9cd8a6a..5f1d2a0f 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java @@ -59,7 +59,7 @@ class SearchIndexWriterTest { } public long[] findWord(SearchIndexReader reader, String word, IndexBlock block) { - IndexSearchBudget budget = new IndexSearchBudget(1_000_000); + IndexSearchBudget budget = new IndexSearchBudget(100); return reader.findWord(block, budget, lv->true, dictionaryWriter.getReadOnly(word)).stream().toArray(); } From ab9704430224d6603674beb6d0e1a76c2f5f3e60 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Tue, 31 May 2022 13:40:18 +0200 Subject: [PATCH 06/27] Fix deprecation warning for Bucket4J --- .../nu/marginalia/wmsa/configuration/server/RateLimiter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/RateLimiter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/RateLimiter.java index 4dc4c8da..06a6131a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/RateLimiter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/RateLimiter.java @@ -66,6 +66,6 @@ public class RateLimiter { private Bucket createBucket() { var refill = Refill.greedy(1, Duration.ofSeconds(refillRate)); var bw = Bandwidth.classic(capacity, refill); - return Bucket4j.builder().addLimit(bw).build(); + return Bucket.builder().addLimit(bw).build(); } } From 3679d433d99363d45e3101799d4a5d20457c2d49 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 31 May 2022 14:15:20 +0200 Subject: [PATCH 07/27] Update 'README.md' --- README.md | 18 +++++++++++------- 1 file changed, 11 
insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 2fa76c4c..cfe88bc9 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,18 @@ This is the source code for marginalia.nu, including the [search engine](https://search.marginalia.nu), the [MEMEX/gemini server](https://memex.marginalia.nu), the and the [encyclopedia service](https://encyclopedia.marginalia.nu). -The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu), -it is fine to mirror it on other hosts, but if you have issues or questions -that is where you want to go. +The aim of the project is to develop new and alternative discovery methods for the Internet. +It's an experimental workshop as much as it is a public service, the overarching goal is to +elevate the more human, non-commercial sides of the Internet. -As it stands now, the project is a bit of a mess as it wasn't developed -with the intention of going open source, a lot of tests and so on make -assumptions about the directory structure, much configuration is hard coded -and so on. Please stand by. A lot of the mess is fairly superficial. +The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu). +It is fine to mirror it on other hosts, but if you have issues or questions +git.marginalia.nu is where you want to go. + +As it stands now, the project is still being set up and is a bit of a mess as +it wasn't developed with the intention of going open source, a lot of tests +and so on make assumptions about the directory structure, much configuration +is hard coded and so on. Please stand by. A lot of the mess is fairly superficial. 
## Contributing From 046b92e0bb31e15699378e41a85d700cc9a2d395 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Tue, 31 May 2022 14:33:59 +0200 Subject: [PATCH 08/27] Cleaning up index code --- .../wmsa/edge/index/EdgeIndexService.java | 42 ++++--------------- .../service/query/IndexSearchBudget.java | 2 +- 2 files changed, 10 insertions(+), 34 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index 81d57139..a04a4c83 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -331,8 +331,6 @@ public class EdgeIndexService extends Service { final Map> results = new HashMap<>(); final DomainResultCountFilter localFilter = new DomainResultCountFilter(specs.limitByDomain); - boolean debug = sq.searchTermsExclude.contains("special:debug"); - for (int i : specBuckets) { int foundResultsCount = results.values().stream().mapToInt(List::size).sum(); @@ -341,37 +339,15 @@ public class EdgeIndexService extends Service { List resultsForBucket = new ArrayList<>(specs.limitByBucket); - if (debug) { - getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms) - .peek(l -> logger.info("Considering {}", Long.toHexString(l))) - .mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id)) - .filter(ri -> { - if (seenResults.contains(ri.url.getId())) { - logger.info("Seen before: {}", Integer.toHexString(ri.url.getId())); - return false; - } - else if (!localFilter.test(i, domainCountFilter, ri)) { - logger.info("DCF: {} - {}:{}", ri.blockId, Integer.toHexString(ri.domain.getId()), Integer.toHexString(ri.url.getId())); - return false; - } - return true; - }) - .limit(specs.limitTotal * 3L) - .distinct() - .limit(Math.min(specs.limitByBucket - - results.values().stream().mapToInt(Collection::size).sum(), 
limit - foundResultsCount)) - .forEach(resultsForBucket::add); - } - else { - getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms) - .mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id)) - .filter(ri -> !seenResults.contains(ri.url.getId()) && localFilter.test(i, domainCountFilter, ri)) - .limit(specs.limitTotal * 3L) - .distinct() - .limit(Math.min(specs.limitByBucket - - results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount)) - .forEach(resultsForBucket::add); - } + getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms) + .mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id)) + .filter(ri -> !seenResults.contains(ri.url.getId()) && localFilter.test(i, domainCountFilter, ri)) + .limit(specs.limitTotal * 3L) + .distinct() + .limit(Math.min(specs.limitByBucket + - results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount)) + .forEach(resultsForBucket::add); + for (var result : resultsForBucket) { seenResults.add(result.url.getId()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java index 2ec30e65..5d18afa4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java @@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.index.service.query; public class IndexSearchBudget { - private long timeout; + private final long timeout; public IndexSearchBudget(long limitTime) { this.timeout = System.currentTimeMillis() + limitTime; From c0e0579c8e6f911df3f4a07944b38adc357c8689 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Tue, 31 May 2022 14:35:02 +0200 Subject: [PATCH 09/27] Updated index.html for search engine to reflect changes in project status. 
--- .../src/main/resources/static/edge/index.html | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/marginalia_nu/src/main/resources/static/edge/index.html b/marginalia_nu/src/main/resources/static/edge/index.html index 13044a6c..166e67b8 100644 --- a/marginalia_nu/src/main/resources/static/edge/index.html +++ b/marginalia_nu/src/main/resources/static/edge/index.html @@ -61,8 +61,12 @@ existed.

The software for this search engine is all custom-built, and all crawling and indexing is - done in-house. + done in-house. The project is open source. Feel free to poke about in the source code or contribute + to the development!

+

Consider supporting the + project!

Read More @@ -98,11 +102,6 @@

Updates

-

☛ The web design of the search engine has been completely overhauled. For the most part, this should - result in even smaller page loads, and better accessibility and easier navigation, but it may still - be a bit rough in some browsers, if you do find any bugs or accessibility problems, please let me - know. You can reach me at kontakt@marginalia.nu. -

☛ The Random Mode has been overhauled, and is quite entertaining. I encourage you to give it a spin.

☛ A simple public API is now available.

@@ -116,6 +115,8 @@

Publicity, Discussion and Events

+
Marginalia Goes Open Source
+
Hacker News, 2022-05-28
You Should Check Out the Indie Web 🎞️
YouTube, You've Got Kat, 2022-03-15
@@ -137,10 +138,10 @@
Clive Thompson OneZero, 2021-09-16
- Hacker News Discussion + A search engine that favors text-heavy sites and punishes modern web design
- 2021-09-16 + Hacker News, 2021-09-16
From d8d0c0e5b2a50c0b7902bc319de428ff62da7eb7 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 1 Jun 2022 14:46:51 +0200 Subject: [PATCH 10/27] Make User-agent configurable. --- marginalia_nu/src/e2e/resources/crawl.sh | 2 + .../wmsa/configuration/UserAgent.java | 5 ++ .../wmsa/configuration/WmsaHome.java | 11 +++++ .../module/ConfigurationModule.java | 30 +++--------- .../module/HostnameProvider.java | 36 --------------- .../configuration/module/PortProvider.java | 46 ------------------- .../wmsa/edge/crawling/CrawlerMain.java | 6 ++- 7 files changed, 29 insertions(+), 107 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/UserAgent.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/HostnameProvider.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/PortProvider.java diff --git a/marginalia_nu/src/e2e/resources/crawl.sh b/marginalia_nu/src/e2e/resources/crawl.sh index 3a0e4b01..16d43fab 100644 --- a/marginalia_nu/src/e2e/resources/crawl.sh +++ b/marginalia_nu/src/e2e/resources/crawl.sh @@ -3,6 +3,8 @@ mkdir -p /var/lib/wmsa/conf/ mkdir -p /var/lib/wmsa/data/ +echo "search.marginalia.nu" > /var/lib/wmsa/conf/user-agent + cat > /var/lib/wmsa/conf/db.properties < { - private static final String DEFAULT_HOSTNAME = "127.0.0.1"; - private final int monitorPort; - private final String monitorHost; - private final int timeout; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - @Inject - public HostnameProvider(@Named("monitor-port") Integer monitorPort, - @Named("monitor-host") String monitorHost, - @Named("monitor-boot-timeout") Integer timeout - ) { - this.monitorHost = monitorHost; - this.monitorPort = monitorPort; - this.timeout = timeout; - } - - @Override - public String get() { - var override = System.getProperty("service-host"); - if (null != override) { - return override; - } - return DEFAULT_HOSTNAME; - } - -} diff 
--git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/PortProvider.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/PortProvider.java deleted file mode 100644 index 7286aa68..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/PortProvider.java +++ /dev/null @@ -1,46 +0,0 @@ -package nu.marginalia.wmsa.configuration.module; - -import com.google.inject.name.Named; -import io.reactivex.rxjava3.core.Flowable; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import org.apache.http.HttpResponse; -import org.reactivestreams.Publisher; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.inject.Inject; -import javax.inject.Provider; -import java.io.IOException; -import java.util.concurrent.TimeUnit; - -public class PortProvider implements Provider { - private static final Integer DEFAULT_PORT = 5000; - private final int monitorPort; - private final String monitorHost; - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final int timeout = 10; - @Inject - public PortProvider(@Named("monitor-port") Integer monitorPort, - @Named("monitor-host") String monitorHost, - @Named("monitor-boot-timeout") Integer timeout) { - this.monitorHost = monitorHost; - this.monitorPort = monitorPort; - } - - @Override - public Integer get() { - return ServiceDescriptor.byName(System.getProperty("service-name")).port; - } - - private Publisher repeatDelay(Flowable error) { - return error.delay(1, TimeUnit.SECONDS); - } - - private String accept200(HttpResponse rsp) throws IOException { - if (rsp.getStatusLine().getStatusCode() != 200) { - throw new RuntimeException("Monitor responded unexpected status " - + rsp.getStatusLine().getStatusCode()); - } - return new String(rsp.getEntity().getContent().readAllBytes()); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java index d81e348b..7238dce0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java @@ -2,6 +2,8 @@ package nu.marginalia.wmsa.edge.crawling; import com.google.gson.Gson; import com.google.gson.GsonBuilder; +import nu.marginalia.wmsa.configuration.UserAgent; +import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver; @@ -34,10 +36,12 @@ public class CrawlerMain implements AutoCloseable { private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS, new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true))); + private final UserAgent userAgent; public CrawlerMain(EdgeCrawlPlan plan) throws Exception { this.inputSpec = plan.getJobSpec(); this.numberOfThreads = 512; + this.userAgent = WmsaHome.getUserAgent(); workLog = new WorkLog(plan.crawl.getLogFile()); domainWriter = new CrawledDomainWriter(plan.crawl.getDir()); @@ -88,7 +92,7 @@ public class CrawlerMain implements AutoCloseable { if (workLog.isJobFinished(specification.id)) return null; - var fetcher = new HttpFetcher("search.marginalia.nu", dispatcher); + var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher); try { var retreiver = new CrawlerRetreiver(fetcher, specification); From dbbef2d119c2230e8134b6113d9384976a3d6a84 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 1 Jun 2022 18:00:30 +0200 Subject: [PATCH 11/27] Update 'README.md' --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cfe88bc9..0b75e796 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,8 @@ the [MEMEX/gemini 
server](https://memex.marginalia.nu), the and the [encyclopedi The aim of the project is to develop new and alternative discovery methods for the Internet. It's an experimental workshop as much as it is a public service, the overarching goal is to -elevate the more human, non-commercial sides of the Internet. +elevate the more human, non-commercial sides of the Internet. A side-goal is to do this without +requiring datacenters and expensive enterprise hardware, to run this operation on affordable hardware. The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu). It is fine to mirror it on other hosts, but if you have issues or questions From 43fed1806330b1fdd0145a1ac90e9524f0aedb8b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 2 Jun 2022 02:30:29 +0200 Subject: [PATCH 12/27] Update 'README.md' --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 0b75e796..a886ad2a 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,10 @@ it wasn't developed with the intention of going open source, a lot of tests and so on make assumptions about the directory structure, much configuration is hard coded and so on. Please stand by. A lot of the mess is fairly superficial. +## Documentation + +Documentation is a work in progress. See the [wiki](https://git.marginalia.nu/marginalia/marginalia.nu/wiki). + ## Contributing The project is still being set up, but if you are interested in contributing, please contact me. From 0e65384781d1d82bd2eda8e9ac096dcd5c6d319f Mon Sep 17 00:00:00 2001 From: vlofgren Date: Fri, 3 Jun 2022 13:32:05 +0200 Subject: [PATCH 13/27] Make WMSA_HOME configurable through an environment variable. 
--- .../nu/marginalia/wmsa/edge/E2ETestBase.java | 2 +- marginalia_nu/src/e2e/resources/init.sh | 33 ++++++++----- .../wmsa/configuration/WmsaHome.java | 49 ++++++++++++++----- .../edge/assistant/EdgeAssistantModule.java | 13 ++--- .../wmsa/edge/converting/ConverterModule.java | 10 +--- .../wmsa/edge/index/EdgeTablesModule.java | 9 ++-- .../wmsa/edge/search/EdgeSearchModule.java | 13 +---- .../resource_store/ResourceStoreModule.java | 1 - 8 files changed, 70 insertions(+), 60 deletions(-) diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java index 769eca40..0c329a79 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java @@ -33,7 +33,7 @@ public abstract class E2ETestBase { .withCopyFileToContainer(jarFile(), "/WMSA.jar") .withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh") .withExposedPorts(service.port) - .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY) + .withFileSystemBind(modelsPath(), "/wmsa/model", BindMode.READ_ONLY) .withNetwork(network) .withNetworkAliases(service.name) .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name))) diff --git a/marginalia_nu/src/e2e/resources/init.sh b/marginalia_nu/src/e2e/resources/init.sh index a6aaea3b..5409f787 100644 --- a/marginalia_nu/src/e2e/resources/init.sh +++ b/marginalia_nu/src/e2e/resources/init.sh @@ -1,13 +1,15 @@ #!/bin/bash -mkdir -p /var/lib/wmsa/encyclopedia -mkdir -p /var/lib/wmsa/conf -mkdir -p /var/lib/wmsa/index/write -mkdir -p /var/lib/wmsa/index/read -mkdir -p /backup/work/index-tmp +HOME=/wmsa -mkdir -p /var/log/wmsa -cat > /var/lib/wmsa/suggestions.txt < ${HOME}/suggestions.txt < /var/lib/wmsa/conf/disks.properties < ${HOME}/conf/disks.properties < /var/lib/wmsa/conf/db.properties < ${HOME}/conf/db.properties < 
/var/lib/wmsa/conf/ranking-settings.yaml < ${HOME}/conf/ranking-settings.yaml < /var/lib/wmsa/conf/hosts < ${HOME}/conf/hosts < Date: Tue, 7 Jun 2022 22:34:53 +0200 Subject: [PATCH 14/27] WIP: Database refactoring --- .../marginalia/util/ranking/AcademiaRank.java | 49 --- .../util/ranking/RankingAlgorithm.java | 20 +- .../ranking/old/OldReversePageRankV2.java | 4 +- .../util/ranking/old/StandardPageRank.java | 4 +- .../util/ranking/tool/DedupTool.java | 2 +- .../util/ranking/tool/PerusePageRankV2.java | 4 +- .../ranking/tool/TestAcademiaRankTool.java | 30 -- .../ranking/tool/UpdateDomainRanksTool.java | 5 - .../ranking/tool/UpdateDomainRanksTool2.java | 3 - .../edge/converting/ReindexTriggerMain.java | 4 +- .../converting/interpreter/Interpreter.java | 2 +- .../instruction/LoadProcessedDomain.java | 4 +- .../wmsa/edge/converting/loader/Loader.java | 6 +- .../converting/loader/SqlLoadDomainLinks.java | 6 +- .../converting/loader/SqlLoadDomains.java | 27 +- .../loader/SqlLoadProcessedDocument.java | 25 +- .../loader/SqlLoadProcessedDomain.java | 27 +- .../edge/converting/loader/SqlLoadUrls.java | 14 +- .../processor/InstructionsCompiler.java | 2 +- .../edge/crawling/CrawlJobExtractorMain.java | 11 +- .../CrawlJobExtractorPageRankMain.java | 8 +- .../wmsa/edge/data/dao/EdgeDataStoreDao.java | 11 - .../edge/data/dao/EdgeDataStoreDaoImpl.java | 326 +++--------------- .../dao/task/EdgeDomainBlacklistImpl.java | 2 +- .../edge/index/service/SearchIndexDao.java | 18 +- .../edge/model/search/EdgeUrlDetails.java | 17 +- .../search/results/SearchResultDecorator.java | 2 +- .../main/resources/sql/edge-crawler-cache.sql | 168 +++------ .../java/nu/marginalia/util/TestUtil.java | 2 +- .../loader/SqlLoadDomainLinksTest.java | 51 +++ .../converting/loader/SqlLoadDomainsTest.java | 54 +++ .../loader/SqlLoadProcessedDocumentTest.java | 68 ++++ .../loader/SqlLoadProcessedDomainTest.java | 52 +++ .../converting/loader/SqlLoadUrlsTest.java | 49 +++ 34 files changed, 451 insertions(+), 
626 deletions(-) delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java deleted file mode 100644 index 272a1798..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java +++ /dev/null @@ -1,49 +0,0 @@ -package nu.marginalia.util.ranking; - -import com.zaxxer.hikari.HikariDataSource; -import gnu.trove.list.TIntList; -import gnu.trove.list.array.TIntArrayList; -import gnu.trove.map.hash.TIntIntHashMap; -import it.unimi.dsi.fastutil.ints.IntArrays; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.sql.SQLException; - -public class AcademiaRank { - private final TIntArrayList result; - private static final Logger logger = LoggerFactory.getLogger(AcademiaRank.class); - - public AcademiaRank(HikariDataSource ds, String... 
origins) throws IOException { - - TIntList rankingResults = new BetterStandardPageRank(ds, origins).pageRank(100_000); - TIntIntHashMap idToRanking = new TIntIntHashMap(100_000, 0.5f, -1, 1_000_000_000); - - for (int i = 0; i < rankingResults.size(); i++) { - idToRanking.put(rankingResults.get(i), i); - } - - result = new TIntArrayList(10000); - try (var conn = ds.getConnection(); - var stmt = conn.prepareStatement("select EC_DOMAIN.ID,COUNT(SOURCE_DOMAIN_ID) AS CNT from EC_DOMAIN INNER JOIN DOMAIN_METADATA ON DOMAIN_METADATA.ID=EC_DOMAIN.ID INNER JOIN EC_DOMAIN_LINK ON EC_DOMAIN_LINK.DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE INDEXED>0 AND STATE>=0 AND STATE<2 AND ((VISITED_URLS>1000+1500*RANK AND RANK<1) OR (GOOD_URLS>1000 AND URL_PART LIKE '%edu')) GROUP BY EC_DOMAIN.ID HAVING CNT<1500 ORDER BY RANK ASC")) { - - stmt.setFetchSize(1000); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - result.add(rsp.getInt(1)); - } - } - catch (SQLException ex) { - logger.error("SQL error", ex); - } - - int[] internalArray = result.toArray(); - IntArrays.quickSort(internalArray, (a,b) -> idToRanking.get(a) - idToRanking.get(b)); - result.set(0, internalArray); - } - - public TIntArrayList getResult() { - return result; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java index fd76989c..b07285d4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java @@ -72,10 +72,10 @@ public abstract class RankingAlgorithm { String s; if (getNames) { - s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND 
QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; } else { - s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; } try (var stmt = conn.prepareStatement(s)) { stmt.setFetchSize(10000); @@ -84,7 +84,7 @@ public abstract class RankingAlgorithm { int id = rsp.getInt(1); if (!spamDomains.contains(id)) { - domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), false)); + domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false)); domainIndexToId.put(domainIndexToId.size(), id); domainIdToIndex.put(id, domainIdToIndex.size()); @@ -125,7 +125,7 @@ public abstract class RankingAlgorithm { } } - try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART LIKE ?")) { + try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) { for (var seed : 
this.originDomains) { stmt.setString(1, seed); var rsp = stmt.executeQuery(); @@ -159,10 +159,10 @@ public abstract class RankingAlgorithm { try (var conn = dataSource.getConnection()) { String s; if (getNames) { - s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; + s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; } else { - s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; + s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; } try (var stmt = conn.prepareStatement(s)) { stmt.setFetchSize(10000); @@ -172,7 +172,7 @@ public abstract class RankingAlgorithm { int id = rsp.getInt(1); if (!spamDomains.contains(id)) { - domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), true)); + domainsById.put(id, new DomainData(id, 
rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), true)); domainIndexToId.put(domainIndexToId.size(), id); domainIdToIndex.put(id, domainIdToIndex.size()); @@ -451,7 +451,7 @@ public abstract class RankingAlgorithm { public final int id; public final String name; private int alias; - private int state; + private EdgeDomainIndexingState state; public final int knownUrls; public boolean peripheral; @@ -465,11 +465,11 @@ public abstract class RankingAlgorithm { } public boolean isSpecial() { - return EdgeDomainIndexingState.SPECIAL.code == state; + return EdgeDomainIndexingState.SPECIAL == state; } public boolean isSocialMedia() { - return EdgeDomainIndexingState.SOCIAL_MEDIA.code == state; + return EdgeDomainIndexingState.SOCIAL_MEDIA == state; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java index 6a214278..02823563 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java @@ -66,7 +66,7 @@ public class OldReversePageRankV2 { originDomains.add("memex.marginalia.nu"); try (var conn = dataSource.getConnection()) { - try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY_RAW>=-10")) { + try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE")) { stmt.setFetchSize(10000); var rsp = stmt.executeQuery(); while (rsp.next()) { @@ -90,7 +90,7 @@ public class OldReversePageRankV2 { } } - try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) { + try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { stmt.setFetchSize(10000); for (var seed : this.originDomains) { diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java index c42b28dd..74bef70a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java @@ -48,7 +48,7 @@ public class StandardPageRank { originDomains.addAll(Arrays.asList(origins)); try (var conn = dataSource.getConnection()) { - try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,URL_PART FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY>=-10")) { + try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,DOMAIN_NAME FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE AND QUALITY>=-10")) { stmt.setFetchSize(10000); var rsp = stmt.executeQuery(); while (rsp.next()) { @@ -78,7 +78,7 @@ public class StandardPageRank { } } - try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) { + try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { for (var seed : this.originDomains) { stmt.setString(1, seed); var rsp = stmt.executeQuery(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java index a5ea8b06..d6f95f51 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java @@ -50,7 +50,7 @@ public class DedupTool { Map>> domainToHashToUrl = new HashMap<>(); try (var conn = ds.getConnection(); - var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.URL_PART FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL"); + var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM 
EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL"); var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?") ) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java index 85a691c2..3f3ce6a5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java @@ -112,10 +112,10 @@ public class PerusePageRankV2 { try (var conn = dataSource.getConnection()) { String s; if (getNames) { - s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID"; + s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID"; } else { - s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID"; + s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID"; } try (var stmt = conn.prepareStatement(s)) { stmt.setFetchSize(10000); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java deleted file mode 100644 index 38192b35..00000000 --- 
a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java +++ /dev/null @@ -1,30 +0,0 @@ -package nu.marginalia.util.ranking.tool; - -import lombok.SneakyThrows; -import nu.marginalia.util.ranking.AcademiaRank; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import org.mariadb.jdbc.Driver; - -import java.io.IOException; - -public class TestAcademiaRankTool { - - @SneakyThrows - public static void main(String... args) { - Driver driver = new Driver(); - var conn = new DatabaseModule().provideConnection(); - - var rank = new AcademiaRank(new DatabaseModule().provideConnection(), "www.perseus.tufts.edu", "xroads.virginia.edu"); - var res = rank.getResult(); - - try (var c = conn.getConnection(); var stmt = c.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) { - for (int i = 0; i < Math.min(res.size(), 100); i++) { - stmt.setInt(1, res.getQuick(i)); - var rsp = stmt.executeQuery(); - while (rsp.next()) - System.out.println(rsp.getString(1)); - } - } - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java index 71ec72a6..5660d9a7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java @@ -83,11 +83,6 @@ public class UpdateDomainRanksTool { } } - logger.info("Recalculating quality"); - try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) { - stmt.executeUpdate(); - } - } catch (SQLException | InterruptedException throwables) { throwables.printStackTrace(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java index 
336b35fd..ec48cd17 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java @@ -94,9 +94,6 @@ public class UpdateDomainRanksTool2 { } logger.info("Recalculating quality"); - try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) { - stmt.executeUpdate(); - } } catch (SQLException | InterruptedException throwables) { throwables.printStackTrace(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java index 050152bc..55648dfd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java @@ -29,7 +29,7 @@ public class ReindexTriggerMain { .build(); try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) { - var rs = stmt.executeQuery("SELECT ID, URL_PART, STATE, INDEXED FROM EC_DOMAIN LIMIT 100"); + var rs = stmt.executeQuery("SELECT ID, DOMAIN_NAME, STATE, INDEXED FROM EC_DOMAIN LIMIT 100"); while (rs.next()) { System.out.printf("%d %s %s %d\n", rs.getInt(1), @@ -38,7 +38,7 @@ public class ReindexTriggerMain { rs.getInt(4)); } - rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, URL, VISITED, STATE FROM EC_URL LIMIT 100"); + rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, PATH, VISITED, STATE FROM EC_URL LIMIT 100"); while (rs.next()) { System.out.printf("%d %d %s %d %s\n", rs.getInt(1), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java index 8755716c..c0698dde 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java @@ -14,7 +14,7 @@ public interface Interpreter { void loadRssFeed(EdgeUrl[] rssFeed); void loadDomainLink(DomainLink[] links); - void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality); + void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip); void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument); void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java index 065d6211..2b1fd631 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java @@ -6,11 +6,11 @@ import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; -public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) implements Instruction { +public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) implements Instruction { @Override public void apply(Interpreter interpreter) { - interpreter.loadProcessedDomain(domain, state, quality); + interpreter.loadProcessedDomain(domain, state, ip); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java index 
140a762a..49a39457 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java @@ -76,9 +76,9 @@ public class Loader implements Interpreter { } @Override - public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) { - logger.debug("loadProcessedDomain({}, {}, {})", domain, state, quality); - sqlLoadProcessedDomain.load(data, domain, state, quality); + public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) { + logger.debug("loadProcessedDomain({}, {}, {})", domain, state, ip); + sqlLoadProcessedDomain.load(data, domain, state, ip); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java index e0978828..6750bd33 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java @@ -30,7 +30,7 @@ public class SqlLoadDomainLinks { INSERT IGNORE INTO EC_DOMAIN_LINK (SOURCE_DOMAIN_ID, DEST_DOMAIN_ID) SELECT SOURCE.ID,DEST.ID FROM EC_DOMAIN SOURCE INNER JOIN EC_DOMAIN DEST - ON SOURCE.URL_PART=FROM_DOMAIN AND DEST.URL_PART=TO_DOMAIN; + ON SOURCE.DOMAIN_NAME=FROM_DOMAIN AND DEST.DOMAIN_NAME=TO_DOMAIN; END """); } @@ -61,8 +61,8 @@ public class SqlLoadDomainLinks { } } } - catch (SQLException sql) { - sql.printStackTrace(); + catch (SQLException ex) { + logger.warn("SQL error inserting domain links", ex); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java index 18cc40bd..76a839c9 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java @@ -25,15 +25,9 @@ public class SqlLoadDomains { stmt.execute(""" CREATE PROCEDURE INSERT_DOMAIN ( IN DOMAIN_NAME VARCHAR(255), - IN SUB_DOMAIN VARCHAR(255), IN TOP_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci) BEGIN - INSERT IGNORE INTO EC_TOP_DOMAIN (URL_PART) VALUES (TOP_DOMAIN); - - INSERT IGNORE INTO EC_DOMAIN(URL_PART, URL_SUBDOMAIN, URL_TOP_DOMAIN_ID) - SELECT DOMAIN_NAME,SUB_DOMAIN,ID - FROM EC_TOP_DOMAIN - WHERE EC_TOP_DOMAIN.URL_PART=TOP_DOMAIN; + INSERT IGNORE INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP) VALUES (DOMAIN_NAME, TOP_DOMAIN); END """); } @@ -46,10 +40,9 @@ public class SqlLoadDomains { public void load(LoaderData data, EdgeDomain domain) { try (var connection = dataSource.getConnection()) { - try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) { + try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) { insertCall.setString(1, domain.toString()); - insertCall.setString(2, domain.subDomain); - insertCall.setString(3, domain.domain); + insertCall.setString(2, domain.domain); insertCall.addBatch(); var ret = insertCall.executeUpdate(); @@ -57,12 +50,11 @@ public class SqlLoadDomains { logger.warn("load({}) -- bad row count {}", domain, ret); } - connection.commit(); findIdForTargetDomain(connection, data); } } catch (SQLException ex) { - ex.printStackTrace(); + logger.warn("SQL error inserting domain", ex); } @@ -73,12 +65,11 @@ public class SqlLoadDomains { try (var connection = dataSource.getConnection()) { connection.setAutoCommit(false); - try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) { + try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) { for (var domain : domains) { insertCall.setString(1, domain.toString()); - insertCall.setString(2, 
domain.subDomain); - insertCall.setString(3, domain.domain); + insertCall.setString(2, domain.domain); insertCall.addBatch(); } var ret = insertCall.executeBatch(); @@ -95,7 +86,7 @@ public class SqlLoadDomains { findIdForTargetDomain(connection, data); } catch (SQLException ex) { - ex.printStackTrace(); + logger.warn("SQL error inserting domains", ex); } } @@ -104,7 +95,7 @@ public class SqlLoadDomains { return; } - try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) + try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { var targetDomain = data.getTargetDomain(); @@ -118,7 +109,7 @@ public class SqlLoadDomains { } } catch (SQLException ex) { - ex.printStackTrace(); + logger.warn("SQL error finding id for domain", ex); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java index b25a657b..b033e6ea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java @@ -31,14 +31,13 @@ public class SqlLoadProcessedDocument { IN TITLE VARCHAR(255), IN DESCRIPTION VARCHAR(255), IN LENGTH INT, - IN QUALITY_MEASURE DOUBLE, IN FEATURES INT, IN STANDARD VARCHAR(32), IN HASH INT) BEGIN SET FOREIGN_KEY_CHECKS=0; - REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES); - UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=QUALITY_MEASURE, DATA_HASH=HASH WHERE ID=URL_ID; + REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH); + UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE 
ID=URL_ID; SET FOREIGN_KEY_CHECKS=1; END """); @@ -47,7 +46,7 @@ public class SqlLoadProcessedDocument { IN URL_ID INT, IN STATE VARCHAR(32)) BEGIN - UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=-100, DATA_HASH=NULL WHERE ID=URL_ID; + UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; END """); @@ -60,7 +59,8 @@ public class SqlLoadProcessedDocument { public void load(LoaderData data, List documents) { try (var conn = dataSource.getConnection(); - var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) { + var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?)")) { + conn.setAutoCommit(false); for (var doc : documents) { int urlId = data.getUrlId(doc.url()); @@ -74,10 +74,9 @@ public class SqlLoadProcessedDocument { stmt.setString(3, doc.title()); stmt.setString(4, doc.description()); stmt.setInt(5, doc.length()); - stmt.setDouble(6, doc.quality()); - stmt.setInt(7, doc.htmlFeatures()); - stmt.setString(8, doc.standard().name()); - stmt.setInt(9, (int) doc.hash()); + stmt.setInt(6, doc.htmlFeatures()); + stmt.setString(7, doc.standard().name()); + stmt.setInt(8, (int) doc.hash()); stmt.addBatch(); } var ret = stmt.executeBatch(); @@ -89,8 +88,8 @@ public class SqlLoadProcessedDocument { } conn.commit(); - } catch (SQLException e) { - e.printStackTrace(); + } catch (SQLException ex) { + logger.warn("SQL error inserting document", ex); } @@ -117,8 +116,8 @@ public class SqlLoadProcessedDocument { logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]); } } - } catch (SQLException e) { - e.printStackTrace(); + } catch (SQLException ex) { + logger.warn("SQL error inserting failed document", ex); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java index 64607b3a..018d76c9 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java @@ -25,12 +25,12 @@ public class SqlLoadProcessedDomain { stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN"); stmt.execute(""" CREATE PROCEDURE INITIALIZE_DOMAIN ( - IN ST INT, + IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'), IN IDX INT, - IN QUAL DOUBLE, - IN DID INT) + IN DID INT, + IN IP VARCHAR(32)) BEGIN - UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), QUALITY=QUAL, QUALITY_RAW=QUAL, QUALITY_ORIGINAL=QUAL WHERE ID=DID; + UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID; DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; END """); @@ -41,7 +41,7 @@ public class SqlLoadProcessedDomain { } } - public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, double quality) { + public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, String ip) { data.setTargetDomain(domain); loadDomains.load(data, domain); @@ -49,18 +49,17 @@ public class SqlLoadProcessedDomain { try (var conn = dataSource.getConnection(); var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)")) { - initCall.setInt(1, state.code); + initCall.setString(1, state.name()); initCall.setInt(2, 1 + data.sizeHint / 100); - initCall.setDouble(3, quality); - initCall.setInt(4, data.getDomainId(domain)); + initCall.setInt(3, data.getDomainId(domain)); + initCall.setString(4, ip); int rc = initCall.executeUpdate(); if (rc < 1) { - logger.warn("load({},{},{}) -- bad rowcount {}", domain, state, quality, rc); + logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc); } - conn.commit(); } catch (SQLException ex) { - ex.printStackTrace(); + logger.warn("SQL error 
initializing domain", ex); } } @@ -69,9 +68,9 @@ public class SqlLoadProcessedDomain { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" UPDATE EC_DOMAIN TARGET - INNER JOIN EC_DOMAIN ALIAS ON ALIAS.URL_PART=? + INNER JOIN EC_DOMAIN ALIAS ON ALIAS.DOMAIN_NAME=? SET TARGET.DOMAIN_ALIAS=ALIAS.ID - WHERE TARGET.URL_PART=? + WHERE TARGET.DOMAIN_NAME=? """)) { stmt.setString(1, link.to().toString()); stmt.setString(2, link.from().toString()); @@ -81,7 +80,7 @@ public class SqlLoadProcessedDomain { } } catch (SQLException ex) { - ex.printStackTrace(); + logger.warn("SQL error inserting domain alias", ex); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java index 7d8851ca..fd698c82 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java @@ -25,12 +25,13 @@ public class SqlLoadUrls { stmt.execute(""" CREATE PROCEDURE INSERT_URL ( IN PROTO VARCHAR(255), - IN DOMAIN_NAME VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, + IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, IN PORT INT, - IN URL VARCHAR(255) + IN PATH VARCHAR(255), + IN PATH_HASH INT ) BEGIN - INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,URL) SELECT PROTO,ID,PORT,URL FROM EC_DOMAIN WHERE URL_PART=DOMAIN_NAME; + INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN; END """); } @@ -42,8 +43,8 @@ public class SqlLoadUrls { public void load(LoaderData data, EdgeUrl[] urls) { try (var conn = dataSource.getConnection(); - var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?)"); - var queryCall = conn.prepareStatement("SELECT ID, PROTO, URL FROM EC_URL WHERE DOMAIN_ID=?") + var 
insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?, ?)"); + var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH FROM EC_URL WHERE DOMAIN_ID=?") ) { conn.setAutoCommit(false); @@ -58,6 +59,7 @@ public class SqlLoadUrls { insertCall.setNull(3, Types.INTEGER); } insertCall.setString(4, url.path); + insertCall.setInt(5, url.path.hashCode()); insertCall.addBatch(); } var ret = insertCall.executeBatch(); @@ -86,7 +88,7 @@ public class SqlLoadUrls { } catch (SQLException ex) { - ex.printStackTrace(); + logger.warn("SQL error inserting URLs", ex); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java index d36cb830..b75de436 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java @@ -15,7 +15,7 @@ public class InstructionsCompiler { public List compile(ProcessedDomain domain) { List ret = new ArrayList<>(domain.size()*4); - ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.averageQuality().orElse(-5.))); + ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); if (domain.documents != null) { compileUrls(ret, domain.documents); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java index 2f25d6d7..52fe338a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java @@ -34,11 +34,10 @@ public class CrawlJobExtractorMain { private static final String domainsSql = """ - SELECT ID, LOWER(EC_DOMAIN.URL_PART) + SELECT ID, 
LOWER(EC_DOMAIN.DOMAIN_NAME) FROM EC_DOMAIN - WHERE QUALITY_RAW>-100 - AND INDEXED>0 - AND STATE<2 + WHERE INDEXED>0 + AND (STATE='ACTIVE' OR STATE='EXHAUSTED') ORDER BY INDEX_DATE ASC, DISCOVER_DATE ASC, @@ -49,8 +48,8 @@ public class CrawlJobExtractorMain { private static final String urlsSql = """ - SELECT CONCAT(PROTO, "://", ?, URL) - FROM EC_URL + SELECT URL + FROM EC_URL_VIEW WHERE DOMAIN_ID=? ORDER BY VISITED DESC, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java index 21935fd0..53997194 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java @@ -30,19 +30,19 @@ public class CrawlJobExtractorPageRankMain { """ SELECT ID FROM EC_DOMAIN - WHERE URL_PART=? + WHERE DOMAIN_NAME=? """; private static final String specificDomainSqlFromId = """ - SELECT LOWER(URL_PART) + SELECT LOWER(DOMAIN_NAME) FROM EC_DOMAIN WHERE ID=? """; private static final String urlsSql = """ - SELECT CONCAT(PROTO, "://", ?, URL) - FROM EC_URL + SELECT URL + FROM EC_URL_VIEW WHERE DOMAIN_ID=? 
ORDER BY VISITED DESC, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java index 81e8dd58..c87088f6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java @@ -16,22 +16,14 @@ public interface EdgeDataStoreDao { boolean isBlacklisted(EdgeDomain domain); EdgeId getDomainId(EdgeDomain domain); - EdgeId getUrlId(EdgeUrl domain); - EdgeUrl getUrl(EdgeId id); - EdgeUrlDetails getUrlDetails(EdgeId id); - List getDomainNeighbors(EdgeId domainId, EdgeDomainBlacklist backlist, int count); List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist backlist, int count); List getRandomDomains(int count, EdgeDomainBlacklist backlist); List getUrlDetailsMulti(List> ids); - List> getDomainIdsFromUrlIds(Collection> urlIds); EdgeDomain getDomain(EdgeId id); - List> inboudUrls(EdgeId id, int limit); - List> outboundUrls(EdgeId id, int limit); - Optional> resolveAmbiguousDomain(String name); @@ -48,9 +40,6 @@ public interface EdgeDataStoreDao { List getLinkingDomains(EdgeId domainId); - List getNewUrls(EdgeId domainId, Collection links); - double getRank(EdgeId domainId); - void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java index a214bb15..2519a745 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java @@ -71,7 +71,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { try (var connection = dataSource.getConnection()) { 
return domainIdCache.get(domain, () -> { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) { + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { stmt.setString(1, domain.toString()); var rsp = stmt.executeQuery(); if (rsp.next()) { @@ -86,104 +86,14 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { } } - @Override - @SneakyThrows - public EdgeId getUrlId(EdgeUrl url) { - try (var connection = dataSource.getConnection()) { - - return urlIdCache.get(url, () -> { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=? AND URL_PROTO=?")) { - stmt.setString(1, url.path); - stmt.setString(2, url.domain.toString()); - stmt.setString(3, url.proto); - - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return new EdgeId<>(rsp.getInt(1)); - } - } - // Lenient mode for http->https upgrades etc - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? 
AND URL_DOMAIN=?")) { - stmt.setString(1, url.path); - stmt.setString(2, url.domain.toString()); - - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return new EdgeId<>(rsp.getInt(1)); - } - } - throw new NoSuchElementException(url.toString()); - }); - } - catch (UncheckedExecutionException ex) { - throw ex.getCause(); + private String idList(List> ids) { + StringJoiner j = new StringJoiner(",", "(", ")"); + for (var id : ids) { + j.add(Integer.toString(id.getId())); } + return j.toString(); } - - @SneakyThrows - @Override - public List> getDomainIdsFromUrlIds(Collection> urlIds) { - List> results = new ArrayList<>(urlIds.size()); - - if (urlIds.isEmpty()) - return results; - - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT DOMAIN_ID FROM EC_URL WHERE ID IN " + urlIds - .stream() - .map(EdgeId::getId) - .map(Object::toString) - .collect(Collectors.joining(",", "(", ")")))) - { - var rsp = stmt.executeQuery(); - while (rsp.next()) { - results.add(new EdgeId<>(rsp.getInt(1))); - } - - } - } - - return results; - } - - static final Pattern badChars = Pattern.compile("[';\\\\]"); - private String saneString(String s) { - return "\'"+badChars.matcher(s).replaceAll("?")+"\'"; - } - @SneakyThrows - @Override - public EdgeUrl getUrl(EdgeId id) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.createStatement()) { - var rsp = stmt.executeQuery("SELECT URL_PROTO, URL_DOMAIN,URL_PORT,URL_PATH FROM EC_URL_VIEW WHERE ID=" + id.getId()); - if (rsp.next()) { - return new EdgeUrl(rsp.getString(1), new EdgeDomain(rsp.getString(2)), rsp.getInt(3), rsp.getString(4)); - } - throw new NoSuchElementException(); - } - } - } - - @SneakyThrows - @Override - public EdgeUrlDetails getUrlDetails(EdgeId id) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.createStatement()) { - var rsp = stmt.executeQuery("SELECT 
ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID=" + id.getId()); - if (rsp.next()) { - EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5)); - return new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17)); - } - throw new NoSuchElementException(); - } - } - } - - @SneakyThrows @Override public List getUrlDetailsMulti(List> ids) { @@ -193,16 +103,38 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { List result = new ArrayList<>(ids.size()); try (var connection = dataSource.getConnection()) { - // This is SQL-injection safe, the IDs are of type int - String idString = ids.stream().map(EdgeId::getId).map(Objects::toString).collect(Collectors.joining(",", "(", ")")); - try (var stmt = connection.prepareStatement("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) { + String idString = idList(ids); + + try (var stmt = connection.prepareStatement( + """ + SELECT ID, URL, + TITLE, DESCRIPTION, + WORDS_TOTAL, FORMAT, FEATURES, + IP, DOMAIN_STATE, DATA_HASH + FROM EC_URL_VIEW WHERE ID IN + """ + idString)) { +// "SELECT 
ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) { stmt.setFetchSize(ids.size()); var rsp = stmt.executeQuery(); while (rsp.next()) { - EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5)); - var val = new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17)); + EdgeUrl url = new EdgeUrl(rsp.getString(2)); + var val = new EdgeUrlDetails(rsp.getInt(1), url, + rsp.getString(3), // title + rsp.getString(4), // description + -5, // quality + rsp.getInt(5), // wordsTotal + rsp.getString(6), // foramt + rsp.getInt(7), // features + rsp.getString(8), // ip + EdgeDomainIndexingState.valueOf(rsp.getString(9)), // domainState + rsp.getInt(10), // dataHash + EdgePageScoreAdjustment.zero(), // urlQualityAdjustment + Integer.MAX_VALUE, // rankingId + Double.MAX_VALUE, // termScore + 0 // queryLength + ); if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) { result.add(val); } @@ -214,75 +146,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { return result; } - @Override - public List getDomainNeighbors(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) { - final Set domains = new HashSet<>(count*3); - - final String q = "SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART from EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL AND EC_DOMAIN_NEIGHBORS.DOMAIN_ID = ? 
ORDER BY ADJ_IDX LIMIT ?"; - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement(q)) { - stmt.setFetchSize(count); - stmt.setInt(1, domainId.getId()); - stmt.setInt(2, count); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int id = rsp.getInt(1); - String domain = rsp.getString(2); - - if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); - } - } - } - - final String q2 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE SOURCE_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?"; - try (var stmt = connection.prepareStatement(q2)) { - - stmt.setFetchSize(count); - stmt.setInt(1, domainId.getId()); - stmt.setInt(2, count); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int id = rsp.getInt(1); - String domain = rsp.getString(2); - - if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); - } - } - } - - final String q3 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE DEST_DOMAIN_ID=? 
AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?"; - try (var stmt = connection.prepareStatement(q3)) { - stmt.setFetchSize(count); - stmt.setInt(1, domainId.getId()); - stmt.setInt(2, count); - - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int id = rsp.getInt(1); - String domain = rsp.getString(2); - - if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); - } - } - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - - - return new ArrayList<>(domains); - } - @Override public List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) { @@ -357,7 +220,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { if (domains.size() < count/2) { final String q3 = """ - SELECT EC_DOMAIN.ID, URL_PART + SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID @@ -399,7 +262,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { @Override public List getRandomDomains(int count, EdgeDomainBlacklist blacklist) { - final String q = "SELECT DOMAIN_ID,URL_PART FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?"; + final String q = "SELECT DOMAIN_ID,DOMAIN_NAME FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?"; List domains = new ArrayList<>(count); try (var conn = dataSource.getConnection()) { try (var stmt = conn.prepareStatement(q)) { @@ -428,7 +291,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { public EdgeDomain getDomain(EdgeId id) { try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT URL_PART FROM 
EC_DOMAIN WHERE ID=?")) { + try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) { stmt.setInt(1, id.getId()); var rsp = stmt.executeQuery(); if (rsp.next()) { @@ -439,55 +302,11 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { } } - @Override @SneakyThrows - public List> inboudUrls(EdgeId id, int limit) { - - List> ret = new ArrayList<>(); - try (var connection = dataSource.getConnection()) { - - try (var stmt = - connection.prepareStatement("SELECT SRC_URL_ID FROM EC_RELATED_LINKS_IN WHERE DEST_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) { - stmt.setFetchSize(limit); - stmt.setInt(1, id.getId()); - stmt.setInt(2, limit); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - ret.add(new EdgeId<>(rsp.getInt(1))); - } - } - - } - - return ret; - } - - - @Override @SneakyThrows - public List> outboundUrls(EdgeId id, int limit) { - - List> ret = new ArrayList<>(); - try (var connection = dataSource.getConnection()) { - - try (var stmt = - connection.prepareStatement("SELECT DEST_URL_ID FROM EC_RELATED_LINKS_IN WHERE SRC_URL_ID=? 
ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) { - stmt.setFetchSize(limit); - stmt.setInt(1, id.getId()); - stmt.setInt(2, limit); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - ret.add(new EdgeId<>(rsp.getInt(1))); - } - } - - } - - return ret; - } @Override public Optional> resolveAmbiguousDomain(String name) { try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { + try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { stmt.setString(1, name); var rsp = stmt.executeQuery(); if (rsp.next()) { @@ -495,7 +314,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { } } - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { + try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { stmt.setString(1, "https://"+name); var rsp = stmt.executeQuery(); if (rsp.next()) { @@ -503,7 +322,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { } } - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { + try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { stmt.setString(1, "http://"+name); var rsp = stmt.executeQuery(); if (rsp.next()) { @@ -511,7 +330,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { } } - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { + try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { stmt.setString(1, "https://www."+name); var rsp = stmt.executeQuery(); if (rsp.next()) { @@ -519,7 +338,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { } } - try (var 
stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { + try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { stmt.setString(1, "http://www."+name); var rsp = stmt.executeQuery(); if (rsp.next()) { @@ -682,27 +501,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { return Collections.emptyList(); } - @Override - public List getNewUrls(EdgeId domainId, Collection links) { - Map edgeUrlByPath = links.stream().collect(Collectors.toMap(EdgeUrl::getPath, Function.identity(), (a,b)->a)); - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT URL FROM EC_URL WHERE DOMAIN_ID=?")) { - stmt.setFetchSize(500); - stmt.setInt(1, domainId.getId()); - var rs = stmt.executeQuery(); - while (rs.next()) { - edgeUrlByPath.remove(rs.getString(1)); - } - } - } - catch (Exception ex) { - return Collections.emptyList(); - } - return new ArrayList<>(edgeUrlByPath.values()); - - } - @Override public double getRank(EdgeId domainId) { try (var connection = dataSource.getConnection()) { @@ -722,47 +520,5 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { return 1; } - @Override - public void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed) { - try (var connection = dataSource.getConnection(); - var stmt = connection.prepareStatement("UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=?, DOMAIN_ALIAS=?, INDEXED=GREATEST(INDEXED,?) 
WHERE ID=?")) { - stmt.setInt(1, state.code); - if (null == alias) { - stmt.setNull(2, Types.INTEGER); - } - else { - stmt.setInt(2, getDomainId(alias).getId()); - } - - stmt.setInt(3, minIndexed); - stmt.setInt(4, getDomainId(domain).getId()); - stmt.executeUpdate(); - connection.commit(); - } - catch (SQLException throwables) { - logger.error("SQL error", throwables); - } - } - - @SneakyThrows - private double getDomainQuality(Connection connection, EdgeDomain src) { - try (var stmt = connection.prepareStatement("SELECT QUALITY_RAW FROM EC_DOMAIN WHERE URL_PART=?")) { - stmt.setString(1, src.toString()); - var res = stmt.executeQuery(); - - if (res.next()) { - var q = res.getDouble(1); - if (q > 0.5) { - logger.warn("gDQ({}) -> 1", src); - } - return 0; - } - } - catch (SQLException ex) { - logger.error("DB error", ex); - } - - return -5; - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java index f4cbb8d0..334ec5a9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java @@ -50,7 +50,7 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist { final TIntHashSet result = new TIntHashSet(1_000_000); try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) { + try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP")) { stmt.setFetchSize(1000); var rsp = stmt.executeQuery(); while (rsp.next()) { 
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java index 615fbc34..c42fcf53 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java @@ -30,29 +30,13 @@ public class SearchIndexDao { logger.info("SearchIndexDao ranking settings = {}", rankingSettings); } - @SneakyThrows - public TIntHashSet getSpamDomains() { - final TIntHashSet result = new TIntHashSet(1_000_000); - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) { - var rsp = stmt.executeQuery(); - while (rsp.next()) { - result.add(rsp.getInt(1)); - } - } - } - - return result; - } - @SneakyThrows public TIntHashSet goodUrls() { TIntHashSet domains = new TIntHashSet(10_000_000, 0.5f, -1); TIntHashSet urls = new TIntHashSet(100_000_000, 0.5f, -1); try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND STATE>=0")) { + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) { stmt.setFetchSize(10_000); var rsp = stmt.executeQuery(); while (rsp.next()) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java index ed5fd013..d46aa79e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java 
@@ -16,25 +16,24 @@ public class EdgeUrlDetails { public String description; public double urlQuality; - public double urlQualityRaw; - public double domainQuality; - public int links; // DEAD public int words; public String format; public int features; - public EdgePageScoreAdjustment urlQualityAdjustment; - public long rankingId; - public double termScore; public String ip; // BROKEN - public int domainState; - public int queryLength; + public EdgeDomainIndexingState domainState; + public int dataHash; + public EdgePageScoreAdjustment urlQualityAdjustment; + public long rankingId; + public double termScore; + public int queryLength; + public long rankingIdAdjustment() { int penalty = 0; @@ -136,7 +135,7 @@ public class EdgeUrlDetails { return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES); } public boolean isSpecialDomain() { - return domainState == EdgeDomainIndexingState.SPECIAL.code; + return domainState == EdgeDomainIndexingState.SPECIAL; } public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java index 487e1556..22b24aca 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java @@ -107,7 +107,7 @@ public class SearchResultDecorator { private double calculateTermScore(IndexBlock block, EdgeSearchResultItem resultItem, EdgeUrlDetails details) { return valuator.evaluateTerms(resultItem.scores, block, details.words) / Math.sqrt(1 + resultItem.queryLength) - + ((details.domainState == EdgeDomainIndexingState.SPECIAL.code) ? 1.25 : 0); + + ((details.domainState == EdgeDomainIndexingState.SPECIAL) ? 
1.25 : 0); } } diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index fc9e515d..6c99eccf 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -1,24 +1,11 @@ -DROP TABLE IF EXISTS EC_URL_LINK; -DROP VIEW IF EXISTS EC_PAGE_VIEW; - -DROP TABLE IF EXISTS DISC_DOMAIN_TAG; -DROP TABLE IF EXISTS DISC_TAG; -DROP TABLE IF EXISTS DISC_USER; - -DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS; +DROP TABLE IF EXISTS DOMAIN_METADATA; DROP TABLE IF EXISTS EC_FEED_URL; DROP TABLE IF EXISTS EC_DOMAIN_LINK; DROP TABLE IF EXISTS EC_PAGE_DATA; DROP TABLE IF EXISTS EC_URL; +DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS; DROP TABLE IF EXISTS EC_DOMAIN; -DROP TABLE IF EXISTS EC_TOP_DOMAIN; -DROP TABLE IF EXISTS EC_URL_DETAILS; -DROP VIEW IF EXISTS EC_URL_VIEW; -DROP VIEW IF EXISTS EC_URL_PART_HASH; -DROP TABLE IF EXISTS EC_URL_WORD; -DROP TABLE IF EXISTS EC_DICTIONARY; -DROP TABLE IF EXISTS DOMAIN_METADATA; CREATE TABLE IF NOT EXISTS DOMAIN_METADATA ( ID INT PRIMARY KEY, @@ -27,52 +14,31 @@ CREATE TABLE IF NOT EXISTS DOMAIN_METADATA ( GOOD_URLS INT DEFAULT 0 ); -CREATE TABLE IF NOT EXISTS EC_TOP_DOMAIN ( - ID INT PRIMARY KEY AUTO_INCREMENT, - URL_PART VARCHAR(255) UNIQUE NOT NULL, - ALIVE BOOLEAN DEFAULT TRUE NOT NULL -) -CHARACTER SET utf8mb4 -COLLATE utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS EC_DOMAIN ( ID INT PRIMARY KEY AUTO_INCREMENT, - URL_PART VARCHAR(255) UNIQUE NOT NULL, - INDEXED INT DEFAULT 0 NOT NULL, - QUALITY DOUBLE DEFAULT -5 NOT NULL, - QUALITY_RAW DOUBLE DEFAULT -5 NOT NULL, - QUALITY_ORIGINAL DOUBLE DEFAULT -5 NOT NULL, - URL_TOP_DOMAIN_ID INT NOT NULL, - URL_SUBDOMAIN VARCHAR(255) NOT NULL, - STATE INT DEFAULT 0 NOT NULL, + DOMAIN_NAME VARCHAR(255) UNIQUE NOT NULL, + DOMAIN_TOP VARCHAR(255) NOT NULL, + + INDEXED INT DEFAULT 0 NOT NULL COMMENT "~number of documents visited / 100", + STATE ENUM('ACTIVE', 
'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN') NOT NULL DEFAULT 'active' COMMENT "@see EdgeDomainIndexingState", RANK DOUBLE, - DOMAIN_ALIAS INTEGER, + IP VARCHAR(32), INDEX_DATE TIMESTAMP DEFAULT NOW(), DISCOVER_DATE TIMESTAMP DEFAULT NOW(), - FOREIGN KEY (URL_TOP_DOMAIN_ID) REFERENCES EC_TOP_DOMAIN(ID) ON DELETE CASCADE -) -CHARACTER SET utf8mb4 -COLLATE utf8mb4_unicode_ci; - -CREATE TABLE IF NOT EXISTS EC_DOMAIN_HISTORY ( - ID INT PRIMARY KEY AUTO_INCREMENT, - URL_PART VARCHAR(255) UNIQUE NOT NULL, - QUALITY_MEASURE DOUBLE DEFAULT -5 NOT NULL, - INBOUND_LINKS INT DEFAULT 1, - LINK_ADJUSTED_QUALITY DOUBLE GENERATED ALWAYS AS (0.3*QUALITY_MEASURE + 0.7*QUALITY_MEASURE / GREATEST(1, INBOUND_LINKS)), - RANK DOUBLE + IS_ALIVE BOOLEAN AS (STATE='ACTIVE' OR STATE='EXHAUSTED' OR STATE='SPECIAL' OR STATE='SOCIAL_MEDIA') VIRTUAL ) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS EC_DOMAIN_BLACKLIST ( ID INT PRIMARY KEY AUTO_INCREMENT, - URL_DOMAIN VARCHAR(255) UNIQUE NOT NULL + URL_DOMAIN VARCHAR(255) UNIQUE NOT NULL ) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; @@ -81,18 +47,15 @@ CREATE TABLE IF NOT EXISTS EC_URL ( ID INT PRIMARY KEY AUTO_INCREMENT, DOMAIN_ID INT NOT NULL, PROTO ENUM('http','https','gemini') NOT NULL, - URL VARCHAR(255) NOT NULL, + PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin, PORT INT, + PATH_HASH INT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain", VISITED BOOLEAN NOT NULL DEFAULT FALSE, - DATA_HASH INTEGER, - QUALITY_MEASURE DOUBLE, STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok', - IP VARCHAR(32), - - CONSTRAINT CONS UNIQUE (DOMAIN_ID, URL), + CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH), FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE ) CHARACTER SET utf8mb4 @@ -101,13 +64,14 @@ COLLATE utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS EC_PAGE_DATA ( ID INT PRIMARY KEY AUTO_INCREMENT, - TITLE VARCHAR(255), - 
DESCRIPTION VARCHAR(255), + TITLE VARCHAR(255) NOT NULL, + DESCRIPTION VARCHAR(255) NOT NULL, - WORDS_DISTINCT INTEGER, - WORDS_TOTAL INTEGER, - FORMAT VARCHAR(8), - FEATURES INT, + WORDS_TOTAL INTEGER NOT NULL, + FORMAT ENUM('PLAIN', 'UNKNOWN', 'HTML123', 'HTML4', 'XHTML', 'HTML5', 'MARKDOWN') NOT NULL, + FEATURES INT COMMENT "Bit-encoded feature set of document, @see HtmlFeature" NOT NULL, + + DATA_HASH INTEGER NOT NULL, FOREIGN KEY (ID) REFERENCES EC_URL(ID) ON DELETE CASCADE ) @@ -115,13 +79,9 @@ CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; CREATE TABLE EC_FEED_URL ( - ID INT PRIMARY KEY AUTO_INCREMENT, - DOMAIN_ID INT NOT NULL, - PROTO VARCHAR(8) NOT NULL, - URL VARCHAR(255) NOT NULL, - PORT INT, + URL VARCHAR(255) PRIMARY KEY, + DOMAIN_ID INT, - CONSTRAINT CONS UNIQUE (DOMAIN_ID, URL), FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE ) CHARACTER SET utf8mb4 @@ -150,29 +110,23 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK ( FOREIGN KEY (DEST_DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE ); -CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK_AGGREGATE ( - DOMAIN_ID INT PRIMARY KEY NOT NULL, - LINKS INT -); - CREATE OR REPLACE VIEW EC_URL_VIEW AS SELECT - EC_DOMAIN.URL_PART AS URL_DOMAIN, - EC_URL.URL AS URL_PATH, - EC_TOP_DOMAIN.URL_PART AS URL_TOP, + IF(PORT IS NULL, + CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, EC_URL.PATH), + CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, ":", EC_URL.PORT, EC_URL.PATH)) + AS URL, + EC_URL.PATH_HASH AS PATH_HASH, + EC_URL.PATH AS PATH, + EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME, + EC_DOMAIN.DOMAIN_TOP AS DOMAIN_TOP, EC_URL.ID AS ID, EC_DOMAIN.ID AS DOMAIN_ID, - EC_TOP_DOMAIN.ID AS TOP_DOMAIN_ID, - EC_URL.PROTO AS URL_PROTO, - EC_URL.PORT AS URL_PORT, EC_URL.VISITED AS VISITED, - EC_URL.DATA_HASH AS DATA_HASH, - EC_URL.QUALITY_MEASURE AS URL_QUALITY_MEASURE, - EC_DOMAIN.QUALITY AS DOMAIN_QUALITY_MEASURE, - EC_DOMAIN.QUALITY_RAW AS QUALITY_RAW, + EC_PAGE_DATA.DATA_HASH AS DATA_HASH, 
EC_PAGE_DATA.TITLE AS TITLE, EC_PAGE_DATA.DESCRIPTION AS DESCRIPTION, - EC_URL.IP AS IP, + EC_DOMAIN.IP AS IP, EC_DOMAIN.STATE AS STATE, EC_PAGE_DATA.WORDS_TOTAL AS WORDS_TOTAL, EC_PAGE_DATA.FORMAT AS FORMAT, @@ -183,59 +137,32 @@ CREATE OR REPLACE VIEW EC_URL_VIEW AS LEFT JOIN EC_PAGE_DATA ON EC_PAGE_DATA.ID = EC_URL.ID INNER JOIN EC_DOMAIN - ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID - INNER JOIN EC_TOP_DOMAIN - ON EC_DOMAIN.URL_TOP_DOMAIN_ID=EC_TOP_DOMAIN.ID; - -CREATE OR REPLACE VIEW EC_DISCOVER_TASKS_VIEW AS - SELECT - ID, - URL_PART - FROM EC_DOMAIN - WHERE - DOMAIN_ALIAS IS NULL - AND INDEXED = 0 - ORDER BY QUALITY DESC, ID ASC; + ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID; CREATE OR REPLACE VIEW EC_RELATED_LINKS_VIEW AS SELECT SOURCE_DOMAIN_ID, - SOURCE_DOMAIN.URL_PART AS SOURCE_URL, - SOURCE_TOP_DOMAIN.URL_PART AS SOURCE_TOP_URL, + SOURCE_DOMAIN.DOMAIN_NAME AS SOURCE_DOMAIN, + SOURCE_DOMAIN.DOMAIN_TOP AS SOURCE_TOP_DOMAIN, DEST_DOMAIN_ID, - DEST_DOMAIN.URL_PART AS DEST_URL, - DEST_TOP_DOMAIN.URL_PART AS DEST_TOP_URL + DEST_DOMAIN.DOMAIN_NAME AS DEST_DOMAIN, + DEST_DOMAIN.DOMAIN_TOP AS DEST_TOP_DOMAIN FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN AS SOURCE_DOMAIN ON SOURCE_DOMAIN.ID=SOURCE_DOMAIN_ID - INNER JOIN EC_TOP_DOMAIN AS SOURCE_TOP_DOMAIN - ON SOURCE_TOP_DOMAIN.ID=SOURCE_DOMAIN.URL_TOP_DOMAIN_ID INNER JOIN EC_DOMAIN AS DEST_DOMAIN ON DEST_DOMAIN.ID=DEST_DOMAIN_ID - INNER JOIN EC_TOP_DOMAIN AS DEST_TOP_DOMAIN - ON DEST_TOP_DOMAIN.ID=DEST_DOMAIN.URL_TOP_DOMAIN_ID ; CREATE OR REPLACE VIEW EC_RELATED_LINKS_IN AS SELECT IN_URL.ID AS SRC_URL_ID, - IN_URL.QUALITY_MEASURE AS SRC_URL_QUALITY, - OUT_URL.ID AS DEST_URL_ID, - OUT_URL.QUALITY_MEASURE AS DEST_URL_QUALITY - FROM EC_URL AS IN_URL - INNER JOIN EC_DOMAIN_LINK - ON IN_URL.DOMAIN_ID=EC_DOMAIN_LINK.SOURCE_DOMAIN_ID - INNER JOIN EC_URL AS OUT_URL - ON OUT_URL.DOMAIN_ID=EC_DOMAIN_LINK.DEST_DOMAIN_ID - WHERE IN_URL.VISITED=TRUE - AND IN_URL.DATA_HASH IS NOT NULL - AND OUT_URL.VISITED=TRUE - AND OUT_URL.DATA_HASH IS NOT NULL; - 
-CREATE TABLE IF NOT EXISTS EC_DOMAIN_BACKLINKS ( - ID INT PRIMARY KEY, - LINKEDNESS INT -); + OUT_URL.ID AS DEST_URL_ID + FROM EC_DOMAIN_LINK + INNER JOIN EC_URL AS IN_URL ON IN_URL.DOMAIN_ID=EC_DOMAIN_LINK.SOURCE_DOMAIN_ID + INNER JOIN EC_URL AS OUT_URL ON OUT_URL.DOMAIN_ID=EC_DOMAIN_LINK.DEST_DOMAIN_ID + WHERE IN_URL.VISITED AND IN_URL.STATE = 'ok' + AND OUT_URL.VISITED AND OUT_URL.STATE = 'ok'; CREATE TABLE IF NOT EXISTS EC_API_KEY ( LICENSE_KEY VARCHAR(255) UNIQUE, @@ -245,16 +172,9 @@ CREATE TABLE IF NOT EXISTS EC_API_KEY ( RATE INT DEFAULT 10 ); -CREATE INDEX IF NOT EXISTS EC_DOMAIN_RANK_INDEX ON EC_DOMAIN (RANK); -CREATE INDEX IF NOT EXISTS EC_DOMAIN_QUALITY_INDEX ON EC_DOMAIN (QUALITY,STATE); - CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED); -CREATE INDEX IF NOT EXISTS EC_DOMAIN_ID_INDEXED_INDEX ON EC_DOMAIN (ID, INDEXED); -CREATE INDEX IF NOT EXISTS EC_DOMAIN_TRIO ON EC_DOMAIN (STATE, DOMAIN_ALIAS, INDEXED, QUALITY); - CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED); -CREATE INDEX IF NOT EXISTS EC_URL_VISITED_STATE ON EC_URL (VISITED, STATE); -CREATE INDEX IF NOT EXISTS EC_URL_IP ON EC_URL (IP); +CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP); ---; diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java b/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java index 84b9f165..26d397a8 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java @@ -43,7 +43,7 @@ public class TestUtil { logger.info("Running script {}", scriptFile); try (var scriptStream = ClassLoader.getSystemResourceAsStream(scriptFile); var stmt = conn.createStatement()) { - for (String s : new String(scriptStream.readAllBytes()).split(";")) { + for (String s : new String(scriptStream.readAllBytes()).split("(;|---)")) { if (!s.isBlank()) { try { Assertions.assertTrue(stmt.executeUpdate(s) >= 0); diff --git 
a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java new file mode 100644 index 00000000..639f5d72 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java @@ -0,0 +1,51 @@ +package nu.marginalia.wmsa.edge.converting.loader; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.util.TestUtil; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; +import org.junit.jupiter.api.parallel.ExecutionMode; +import org.junit.jupiter.api.parallel.ResourceAccessMode; +import org.junit.jupiter.api.parallel.ResourceLock; + +import java.net.URISyntaxException; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE) +@Execution(ExecutionMode.SAME_THREAD) +@Tag("db") +class SqlLoadDomainLinksTest { + + HikariDataSource dataSource; + LoaderData loaderData; + @BeforeEach + public void setUp() { + dataSource = TestUtil.getConnection(); + TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql"); + + var loadDomains = new SqlLoadDomains(dataSource); + loaderData = new LoaderData(10); + + loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); + loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") }); + } + + @AfterEach + public void tearDown() { + dataSource.close(); + } + + @Test + public void loadDomainLinks() throws URISyntaxException { + var loader = new 
SqlLoadDomainLinks(dataSource); + loader.load(new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) }); + } + +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java new file mode 100644 index 00000000..78ab9866 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java @@ -0,0 +1,54 @@ +package nu.marginalia.wmsa.edge.converting.loader; + +import nu.marginalia.util.TestUtil; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; +import org.junit.jupiter.api.parallel.ExecutionMode; +import org.junit.jupiter.api.parallel.ResourceAccessMode; +import org.junit.jupiter.api.parallel.ResourceLock; + +import static org.junit.jupiter.api.Assertions.*; + +@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE) +@Execution(ExecutionMode.SAME_THREAD) +@Tag("db") +class SqlLoadDomainsTest { + + + @Test + public void loadDomain() { + + try (var dataSource = TestUtil.getConnection()) { + TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql"); + + var loadDomains = new SqlLoadDomains(dataSource); + var loaderData = new LoaderData(10); + + loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); + loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu")); + + assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0); + } + + } + + @Test + public void loadDomains() { + + try (var dataSource = TestUtil.getConnection()) { + TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql"); + + var loadDomains = new SqlLoadDomains(dataSource); + var loaderData = new LoaderData(10); + + loaderData.setTargetDomain(new 
EdgeDomain("www.marginalia.nu")); + loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") }); + + assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0); + assertTrue(loaderData.getDomainId(new EdgeDomain("memex.marginalia.nu")) >= 0); + } + + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java new file mode 100644 index 00000000..e82309c3 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java @@ -0,0 +1,68 @@ +package nu.marginalia.wmsa.edge.converting.loader; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.util.TestUtil; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; +import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; +import org.junit.jupiter.api.parallel.ExecutionMode; +import org.junit.jupiter.api.parallel.ResourceAccessMode; +import org.junit.jupiter.api.parallel.ResourceLock; + +import java.net.URISyntaxException; +import java.util.List; +import java.util.Set; + +@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE) +@Execution(ExecutionMode.SAME_THREAD) +@Tag("db") +class SqlLoadProcessedDocumentTest { + HikariDataSource dataSource; + LoaderData loaderData; + @BeforeEach + public void 
setUp() throws URISyntaxException { + dataSource = TestUtil.getConnection(); + TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql"); + + var loadDomains = new SqlLoadDomains(dataSource); + var loadUrls = new SqlLoadUrls(dataSource); + + loaderData = new LoaderData(10); + + loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); + loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu")); + + loadUrls.load(loaderData, new EdgeUrl[]{new EdgeUrl("https://www.marginalia.nu/")}); + + } + + @AfterEach + public void tearDown() { + dataSource.close(); + } + + @Test + public void loadProcessedDocument() throws URISyntaxException { + var loader = new SqlLoadProcessedDocument(dataSource); + loader.load(loaderData, List.of(new LoadProcessedDocument( + new EdgeUrl("https://www.marginalia.nu/"), + EdgeUrlState.OK, + "TITLE", + "DESCR", + HtmlFeature.encode(Set.of(HtmlFeature.AFFILIATE_LINK)), + EdgeHtmlStandard.HTML5, + 100, + 12345, + -5 + ))); + } + +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java new file mode 100644 index 00000000..805cf9c6 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java @@ -0,0 +1,52 @@ +package nu.marginalia.wmsa.edge.converting.loader; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.util.TestUtil; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; +import 
org.junit.jupiter.api.parallel.ExecutionMode; +import org.junit.jupiter.api.parallel.ResourceAccessMode; +import org.junit.jupiter.api.parallel.ResourceLock; + +import java.net.URISyntaxException; + +@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE) +@Execution(ExecutionMode.SAME_THREAD) +@Tag("db") +class SqlLoadProcessedDomainTest { + HikariDataSource dataSource; + LoaderData loaderData; + @BeforeEach + public void setUp() { + dataSource = TestUtil.getConnection(); + TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql"); + + var loadDomains = new SqlLoadDomains(dataSource); + loaderData = new LoaderData(10); + + loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); + loadDomains.load(loaderData, new EdgeDomain[]{ new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") }); + } + + @AfterEach + public void tearDown() { + dataSource.close(); + } + + @Test + public void loadProcessedDomain() { + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); + loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), EdgeDomainIndexingState.BLOCKED, "127.0.0.1"); + } + @Test + public void loadDomainAlias() { + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); + loader.loadAlias(loaderData, new DomainLink(new EdgeDomain("memex.marginalia.nu"), new EdgeDomain("www.marginalia.nu"))); + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java new file mode 100644 index 00000000..c8558357 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java @@ -0,0 +1,49 @@ +package nu.marginalia.wmsa.edge.converting.loader; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.util.TestUtil; +import 
nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; +import org.junit.jupiter.api.parallel.ExecutionMode; +import org.junit.jupiter.api.parallel.ResourceAccessMode; +import org.junit.jupiter.api.parallel.ResourceLock; + +import java.net.URISyntaxException; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE) +@Execution(ExecutionMode.SAME_THREAD) +@Tag("db") +class SqlLoadUrlsTest { + HikariDataSource dataSource; + LoaderData loaderData; + @BeforeEach + public void setUp() { + dataSource = TestUtil.getConnection(); + TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql"); + + var loadDomains = new SqlLoadDomains(dataSource); + loaderData = new LoaderData(10); + + loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); + loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu")); + } + + @AfterEach + public void tearDown() { + dataSource.close(); + } + + @Test + public void loadUrl() throws URISyntaxException { + var loadUrls = new SqlLoadUrls(dataSource); + loadUrls.load(loaderData, new EdgeUrl[] { new EdgeUrl("https://www.marginalia.nu/") }); + } + +} \ No newline at end of file From 026ba714b5359c7af400fd24dbd70d8e84c93481 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 8 Jun 2022 15:32:03 +0200 Subject: [PATCH 15/27] WIP: Database refactoring --- marginalia_nu/build.gradle | 31 ++++--- .../loader/SqlLoadDomainLinksTest.java | 29 +++--- .../converting/loader/SqlLoadDomainsTest.java | 28 +++--- .../loader/SqlLoadProcessedDocumentTest.java | 47 +++++++--- .../loader/SqlLoadProcessedDomainTest.java | 26 +++--- .../converting/loader/SqlLoadUrlsTest.java | 25 +++--- .../service/SearchIndexConverterTest.java | 89 
------------------- 7 files changed, 106 insertions(+), 169 deletions(-) delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java diff --git a/marginalia_nu/build.gradle b/marginalia_nu/build.gradle index b2115fb0..eb553649 100644 --- a/marginalia_nu/build.gradle +++ b/marginalia_nu/build.gradle @@ -59,12 +59,12 @@ dependencies { implementation "com.sparkjava:spark-core:2.9.3" implementation 'com.opencsv:opencsv:5.6' - implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1' + implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2' + implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2' + implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2' + implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2' + implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2' + implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2' implementation 'org.slf4j:slf4j-api:1.7.36' @@ -76,7 +76,6 @@ dependencies { implementation 'com.github.ThatJavaNerd:JRAW:1.1.0' implementation group: 'com.h2database', name: 'h2', version: '2.1.210' - testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.3.1' implementation 'org.jsoup:jsoup:1.14.3' implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2' @@ 
-86,7 +85,7 @@ dependencies { implementation 'com.zaxxer:HikariCP:5.0.1' - implementation 'org.apache.opennlp:opennlp-tools:1.9.3' + implementation 'org.apache.opennlp:opennlp-tools:1.9.4' implementation 'io.prometheus:simpleclient:0.15.0' implementation 'io.prometheus:simpleclient_servlet:0.15.0' implementation 'io.prometheus:simpleclient_httpserver:0.15.0' @@ -123,15 +122,19 @@ dependencies { testImplementation 'org.projectlombok:lombok:1.18.24' testAnnotationProcessor 'org.projectlombok:lombok:1.18.24' + testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.5.1' + + testImplementation platform('org.testcontainers:testcontainers-bom:1.17.2') + testImplementation 'org.testcontainers:mariadb:1.17.2' + testImplementation "org.testcontainers:junit-jupiter:1.17.2" + e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2' e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine' e2eTestImplementation 'org.projectlombok:lombok:1.18.24' - e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.22' - e2eTestImplementation 'org.testcontainers:mariadb:1.17.1' - e2eTestImplementation 'org.testcontainers:nginx:1.17.1' - e2eTestImplementation 'org.testcontainers:testcontainers:1.17.1' - e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.1" - e2eTestImplementation "org.testcontainers:selenium:1.17.1" + e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24' + e2eTestImplementation 'org.testcontainers:nginx:1.17.2' + e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2" + e2eTestImplementation "org.testcontainers:selenium:1.17.2" e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4' e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4' } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java index 639f5d72..d839bbb2 
100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java @@ -4,31 +4,28 @@ import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.util.TestUtil; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.parallel.Execution; -import org.junit.jupiter.api.parallel.ExecutionMode; -import org.junit.jupiter.api.parallel.ResourceAccessMode; -import org.junit.jupiter.api.parallel.ResourceLock; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; -import java.net.URISyntaxException; - -import static org.junit.jupiter.api.Assertions.assertTrue; - -@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE) -@Execution(ExecutionMode.SAME_THREAD) -@Tag("db") +@Testcontainers class SqlLoadDomainLinksTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/edge-crawler-cache.sql") + .withNetworkAliases("mariadb"); HikariDataSource dataSource; LoaderData loaderData; @BeforeEach public void setUp() { - dataSource = TestUtil.getConnection(); - TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql"); + dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); var loadDomains = new SqlLoadDomains(dataSource); loaderData = new LoaderData(10); @@ -43,7 +40,7 @@ class SqlLoadDomainLinksTest { } @Test - public void loadDomainLinks() throws 
URISyntaxException { + public void loadDomainLinks() { var loader = new SqlLoadDomainLinks(dataSource); loader.load(new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) }); } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java index 78ab9866..25dd18b4 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java @@ -2,27 +2,27 @@ package nu.marginalia.wmsa.edge.converting.loader; import nu.marginalia.util.TestUtil; import nu.marginalia.wmsa.edge.model.EdgeDomain; -import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.parallel.Execution; -import org.junit.jupiter.api.parallel.ExecutionMode; -import org.junit.jupiter.api.parallel.ResourceAccessMode; -import org.junit.jupiter.api.parallel.ResourceLock; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; import static org.junit.jupiter.api.Assertions.*; -@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE) -@Execution(ExecutionMode.SAME_THREAD) -@Tag("db") +@Testcontainers class SqlLoadDomainsTest { - + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/edge-crawler-cache.sql") + .withNetworkAliases("mariadb"); @Test public void loadDomain() { - try (var dataSource = TestUtil.getConnection()) { - TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql"); - + try (var dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());) { var loadDomains = 
new SqlLoadDomains(dataSource); var loaderData = new LoaderData(10); @@ -37,9 +37,7 @@ class SqlLoadDomainsTest { @Test public void loadDomains() { - try (var dataSource = TestUtil.getConnection()) { - TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql"); - + try (var dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());) { var loadDomains = new SqlLoadDomains(dataSource); var loaderData = new LoaderData(10); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java index e82309c3..e81e44e3 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java @@ -4,33 +4,44 @@ import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.util.TestUtil; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; +import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl; import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.parallel.Execution; -import org.junit.jupiter.api.parallel.ExecutionMode; -import org.junit.jupiter.api.parallel.ResourceAccessMode; -import org.junit.jupiter.api.parallel.ResourceLock; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import 
org.testcontainers.junit.jupiter.Testcontainers; import java.net.URISyntaxException; import java.util.List; import java.util.Set; -@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE) -@Execution(ExecutionMode.SAME_THREAD) -@Tag("db") +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +@Testcontainers class SqlLoadProcessedDocumentTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/edge-crawler-cache.sql") + .withNetworkAliases("mariadb"); + HikariDataSource dataSource; LoaderData loaderData; + EdgeDataStoreDaoImpl dataStoreDao; + @BeforeEach public void setUp() throws URISyntaxException { - dataSource = TestUtil.getConnection(); - TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql"); + dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); + dataStoreDao = new EdgeDataStoreDaoImpl(dataSource); var loadDomains = new SqlLoadDomains(dataSource); var loadUrls = new SqlLoadUrls(dataSource); @@ -46,14 +57,17 @@ class SqlLoadProcessedDocumentTest { @AfterEach public void tearDown() { + dataStoreDao.clearCaches(); dataSource.close(); } @Test public void loadProcessedDocument() throws URISyntaxException { var loader = new SqlLoadProcessedDocument(dataSource); + var url = new EdgeUrl("https://www.marginalia.nu/"); + loader.load(loaderData, List.of(new LoadProcessedDocument( - new EdgeUrl("https://www.marginalia.nu/"), + url, EdgeUrlState.OK, "TITLE", "DESCR", @@ -63,6 +77,17 @@ class SqlLoadProcessedDocumentTest { 12345, -5 ))); + + var details = dataStoreDao.getUrlDetailsMulti(List.of(new EdgeId<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/"))))); + assertEquals(1, details.size()); + + var urlDetails = details.get(0); + + assertEquals("TITLE", urlDetails.getTitle()); + assertEquals("DESCR", 
urlDetails.getDescription()); + assertTrue(urlDetails.isAffiliate()); + assertEquals(100, urlDetails.words); + assertEquals(12345, urlDetails.dataHash); } } \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java index 805cf9c6..eb66da92 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java @@ -7,25 +7,27 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.parallel.Execution; -import org.junit.jupiter.api.parallel.ExecutionMode; -import org.junit.jupiter.api.parallel.ResourceAccessMode; -import org.junit.jupiter.api.parallel.ResourceLock; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; -import java.net.URISyntaxException; - -@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE) -@Execution(ExecutionMode.SAME_THREAD) -@Tag("db") +@Testcontainers class SqlLoadProcessedDomainTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/edge-crawler-cache.sql") + .withNetworkAliases("mariadb"); + HikariDataSource dataSource; LoaderData loaderData; @BeforeEach public void setUp() { - dataSource = TestUtil.getConnection(); - TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql"); + + dataSource = 
TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); var loadDomains = new SqlLoadDomains(dataSource); loaderData = new LoaderData(10); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java index c8558357..5afac733 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java @@ -6,27 +6,28 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.parallel.Execution; -import org.junit.jupiter.api.parallel.ExecutionMode; -import org.junit.jupiter.api.parallel.ResourceAccessMode; -import org.junit.jupiter.api.parallel.ResourceLock; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; import java.net.URISyntaxException; -import static org.junit.jupiter.api.Assertions.assertTrue; - -@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE) -@Execution(ExecutionMode.SAME_THREAD) -@Tag("db") +@Testcontainers class SqlLoadUrlsTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/edge-crawler-cache.sql") + .withNetworkAliases("mariadb"); + HikariDataSource dataSource; LoaderData loaderData; @BeforeEach public void setUp() { - dataSource = TestUtil.getConnection(); - TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql"); + dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); var 
loadDomains = new SqlLoadDomains(dataSource); loaderData = new LoaderData(10); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java deleted file mode 100644 index f42f2d36..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java +++ /dev/null @@ -1,89 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service; - -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; - -class SearchIndexConverterTest { - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - @Test @Disabled @SneakyThrows - public void test() { - // File dictFile = new File("/home/vlofgren/dictionary.dat"); - File inFile = new File("/home/vlofgren/Work/converter/3/page-index.dat"); - - new SearchIndexConverter(IndexBlock.Title, 0, Path.of("/tmp"), inFile, - new File("/home/vlofgren/Work/converter/words.dat"), - new File("/home/vlofgren/Work/converter/urls.dat"), new SearchIndexPartitioner(null), val -> false); - - // sanityCheck(); - } - - @Test @Disabled - public void sanityCheck() { - File inFile = new File("/home/vlofgren/write/6/page-index.dat"); - -// SearchIndexReader sir = new SearchIndexReader(new SearchIndex[]{ -// new SearchIndex("body", Path.of("/tmp"), -// new File("/home/vlofgren/data/urls.dat"), -// new File("/home/vlofgren/data/words.dat")), -// new SearchIndex("body", Path.of("/tmp"), -// new File("/home/vlofgren/data/urls.dat"), -// new File("/home/vlofgren/data/words.dat")) 
-// , -// new SearchIndex("body", Path.of("/tmp"), -// new File("/home/vlofgren/data/urls.dat"), -// new File("/home/vlofgren/data/words.dat")) -// , -// new SearchIndex("body", Path.of("/tmp"), -// new File("/home/vlofgren/data/urls.dat"), -// new File("/home/vlofgren/data/words.dat")) -// }); - -// getQuery(sir, new EdgeIndexSearchTerms(List.of(152, 106), Collections.emptyList())).stream().forEach(System.out::println); -// sir.findWord(152).also(106).stream().forEach(System.out::println); -// scanFile(inFile, (url, word) -> { -// //System.out.println(url + " " + word); -// if (!sir.findWord(word).stream().anyMatch(url::equals)) { -// logger.error("Can't find word {} in {}", word, url); -// } -// }); - - - } -/* - private SearchIndexReader.Query getQuery(SearchIndexReader indexReader, EdgeIndexSearchTerms searchTerms) { - var orderedIncludes = searchTerms.includes - .stream() - .sorted(Comparator.comparingLong(indexReader::numHits)) - .distinct() - .mapToInt(Integer::intValue) - .toArray(); - - logger.info("Includes: ({}); excludes: ({})", Arrays. 
- stream(orderedIncludes) - .mapToObj(String::valueOf) - .collect(Collectors.joining(",")), - searchTerms.excludes.stream().map(String::valueOf).collect(Collectors.joining(","))); - SearchIndexReader.Query query = indexReader.findWord(orderedIncludes[0]); - for (int i = 1; i < orderedIncludes.length; i++) { - query = query.also(orderedIncludes[i]); - } - for (int term : searchTerms.excludes) { - query = query.not(term); - } - return query; - } - -*/ -} \ No newline at end of file From 5e472fe121fcf42de70c3286a1a191686382e4ca Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 8 Jun 2022 16:18:00 +0200 Subject: [PATCH 16/27] WIP: Refactored ranking algorithms to separate database code from ranking code --- .../util/ranking/BetterReversePageRank.java | 8 +- .../util/ranking/BetterStandardPageRank.java | 8 +- .../util/ranking/BuggyReversePageRank.java | 8 +- .../util/ranking/BuggyStandardPageRank.java | 8 +- .../util/ranking/RankingAlgorithm.java | 256 +++++------------ .../util/ranking/RankingDomainData.java | 33 +++ .../util/ranking/RankingDomainFetcher.java | 105 +++++++ .../ranking/tool/UpdateDomainRanksTool.java | 9 +- .../ranking/tool/UpdateDomainRanksTool2.java | 9 +- .../CrawlJobExtractorPageRankMain.java | 7 +- .../wmsa/edge/data/dao/EdgeDataStoreDao.java | 21 +- .../edge/data/dao/EdgeDataStoreDaoImpl.java | 270 ++---------------- .../edge/index/service/SearchIndexDao.java | 23 +- .../wmsa/edge/model/EdgeDomain.java | 5 +- .../siteinfo/DomainInformationService.java | 225 +++++++++++++-- .../wmsa/edge/tools/IndexMergerMain.java | 5 +- 16 files changed, 488 insertions(+), 512 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java 
index f2889ad6..7d3b17c4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java @@ -1,15 +1,11 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; - -import java.io.IOException; - public class BetterReversePageRank extends RankingAlgorithm { - public BetterReversePageRank(HikariDataSource dataSource, String... origins) { - super(dataSource, origins); + public BetterReversePageRank(RankingDomainFetcher domains, String... origins) { + super(domains, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java index 5b64fa73..f1f9b0b1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java @@ -1,14 +1,10 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; - -import java.io.IOException; - public class BetterStandardPageRank extends RankingAlgorithm { - public BetterStandardPageRank(HikariDataSource dataSource, String... origins) { - super(dataSource, origins); + public BetterStandardPageRank(RankingDomainFetcher domains, String... 
origins) { + super(domains, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java index 1e87776c..485ba353 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java @@ -1,15 +1,11 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; - -import java.io.IOException; - public class BuggyReversePageRank extends RankingAlgorithm { - public BuggyReversePageRank(HikariDataSource dataSource, String... origins) { - super(dataSource, origins); + public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) { + super(domains, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java index a3d7b87e..836bcdfe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java @@ -1,14 +1,10 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; - -import java.io.IOException; - public class BuggyStandardPageRank extends RankingAlgorithm { - public BuggyStandardPageRank(HikariDataSource dataSource, String... origins) { - super(dataSource, origins); + public BuggyStandardPageRank(RankingDomainFetcher domains, String... 
origins) { + super(domains, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java index b07285d4..875031f1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java @@ -1,35 +1,26 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntObjectHashMap; -import gnu.trove.set.hash.TIntHashSet; import it.unimi.dsi.fastutil.ints.IntComparator; -import lombok.AllArgsConstructor; -import lombok.Data; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.sql.SQLException; import java.util.*; import java.util.function.IntToDoubleFunction; import java.util.stream.IntStream; import it.unimi.dsi.fastutil.ints.IntArrays; public abstract class RankingAlgorithm { - final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); + final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); final TIntIntHashMap domainIndexToId = new TIntIntHashMap(); final TIntIntHashMap domainIdToIndex = new TIntIntHashMap(); - private final TIntHashSet spamDomains; - private final HikariDataSource dataSource; - TIntArrayList[] linkDataSrc2Dest; TIntArrayList[] linkDataDest2Src; @@ -41,10 +32,14 @@ public abstract class RankingAlgorithm { private static final boolean getNames = true; private final Logger logger = LoggerFactory.getLogger(getClass()); + private RankingDomainFetcher domains; public static void main(String... 
args) throws IOException { - var rpr = new BuggyReversePageRank(new DatabaseModule().provideConnection(), "wiki.xxiivv.com"); - var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu"); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + + var rpr = new BuggyReversePageRank(domains, "wiki.xxiivv.com"); + var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu"); var rankVector = spr.pageRankVector(); var norm = rankVector.norm(); @@ -61,164 +56,97 @@ public abstract class RankingAlgorithm { return domainsById.get(id).peripheral; } - public RankingAlgorithm(HikariDataSource dataSource, String... origins) { - this.dataSource = dataSource; - var blacklist = new EdgeDomainBlacklistImpl(dataSource); + public RankingAlgorithm(RankingDomainFetcher domains, String... origins) { + this.domains = domains; - spamDomains = blacklist.getSpamDomains(); originDomains.addAll(Arrays.asList(origins)); - try (var conn = dataSource.getConnection()) { + domains.getDomains(domainData -> { + int id = domainData.id; - String s; - if (getNames) { - s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + domainsById.put(id, domainData); + + domainIndexToId.put(domainIndexToId.size(), id); + domainIdToIndex.put(id, domainIdToIndex.size()); + }); + + linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()]; + linkDataDest2Src = new TIntArrayList[domainIndexToId.size()]; + + domains.eachDomainLink((src, dst) -> { + if (src == dst) return; + + if (domainsById.contains(src) && domainsById.contains(dst)) { + + int srcIdx = domainIdToIndex.get(src); + int dstIdx = 
domainIdToIndex.get(domainsById.get(dst).resolveAlias()); + + if (linkDataSrc2Dest[srcIdx] == null) { + linkDataSrc2Dest[srcIdx] = new TIntArrayList(); + } + linkDataSrc2Dest[srcIdx].add(dstIdx); + + if (linkDataDest2Src[dstIdx] == null) { + linkDataDest2Src[dstIdx] = new TIntArrayList(); + } + linkDataDest2Src[dstIdx].add(srcIdx); + } + }); + + for (var namePattern : this.originDomains) { + domains.domainsByPattern(namePattern, i -> { + int ival = domainIdToIndex.get(i); + if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) { + originDomainIds.add(ival); } else { - s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + logger.debug("No value for {}", i); } - try (var stmt = conn.prepareStatement(s)) { - stmt.setFetchSize(10000); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int id = rsp.getInt(1); - if (!spamDomains.contains(id)) { - - domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false)); - - domainIndexToId.put(domainIndexToId.size(), id); - domainIdToIndex.put(id, domainIdToIndex.size()); - } - } - } - - - linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()]; - linkDataDest2Src = new TIntArrayList[domainIndexToId.size()]; - - try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) { - stmt.setFetchSize(10000); - - var rsp = stmt.executeQuery(); - - while (rsp.next()) { - int src = rsp.getInt(1); - int dst = rsp.getInt(2); - - if (src == dst) continue; - - if (domainsById.contains(src) && domainsById.contains(dst)) { - - int srcIdx = domainIdToIndex.get(src); - int dstIdx = 
domainIdToIndex.get(domainsById.get(dst).resolveAlias()); - - if (linkDataSrc2Dest[srcIdx] == null) { - linkDataSrc2Dest[srcIdx] = new TIntArrayList(); - } - linkDataSrc2Dest[srcIdx].add(dstIdx); - - if (linkDataDest2Src[dstIdx] == null) { - linkDataDest2Src[dstIdx] = new TIntArrayList(); - } - linkDataDest2Src[dstIdx].add(srcIdx); - } - } - } - - try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) { - for (var seed : this.originDomains) { - stmt.setString(1, seed); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int i = rsp.getInt(1); - int ival = domainIdToIndex.get(i); - if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) { - originDomainIds.add(ival); - } - else { - logger.debug("No value for {}", i); - } - } - logger.debug("{} -> {}", seed, originDomainIds.size()); - } - } - - logger.info("Origin Domains: {}", originDomainIds.size()); - - } catch (SQLException throwables) { - logger.error("SQL error", throwables); + }); } + logger.info("Origin Domains: {}", originDomainIds.size()); } - public void addPeripheralNodes(boolean includeErrorStates) { + public void addPeripheralNodes() { int newNodesIdxCutoff = domainIdToIndex.size(); logger.info("Inserting peripheral nodes"); - try (var conn = dataSource.getConnection()) { - String s; - if (getNames) { - s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; + domains.getPeripheralDomains(domainData -> { + int id = domainData.id; + + if (domainsById.put(id, domainData) == null) { // true if id was not already present + domainIndexToId.put(domainIndexToId.size(), id); + domainIdToIndex.put(id, domainIdToIndex.size()); } - else { - s = "SELECT 
EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; - } - try (var stmt = conn.prepareStatement(s)) { - stmt.setFetchSize(10000); - var rsp = stmt.executeQuery(); + }); - while (rsp.next()) { - int id = rsp.getInt(1); + linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size()); + linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size()); - if (!spamDomains.contains(id)) { - domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), true)); + domains.eachDomainLink((src, dst) -> { + if (src == dst) return; - domainIndexToId.put(domainIndexToId.size(), id); - domainIdToIndex.put(id, domainIdToIndex.size()); - } + if (domainsById.contains(src) && domainsById.contains(dst)) { + int srcIdx = domainIdToIndex.get(src); + int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); + + // This looks like a bug, but it improves the results + if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff) + return; + + if (linkDataSrc2Dest[srcIdx] == null) { + linkDataSrc2Dest[srcIdx] = new TIntArrayList(); } + linkDataSrc2Dest[srcIdx].add(dstIdx); - } - - linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size()); - linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size()); - - try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) { - stmt.setFetchSize(10000); - - var rsp = stmt.executeQuery(); - - while (rsp.next()) { - int src = rsp.getInt(1); - int dst = rsp.getInt(2); - - if (src == dst) continue; - - if (domainsById.contains(src) && domainsById.contains(dst)) { - - int srcIdx = 
domainIdToIndex.get(src); - int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); - - // This looks like a bug, but it improves the results - if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff) - continue; - - if (linkDataSrc2Dest[srcIdx] == null) { - linkDataSrc2Dest[srcIdx] = new TIntArrayList(); - } - linkDataSrc2Dest[srcIdx].add(dstIdx); - - if (linkDataDest2Src[dstIdx] == null) { - linkDataDest2Src[dstIdx] = new TIntArrayList(); - } - linkDataDest2Src[dstIdx].add(srcIdx); - } + if (linkDataDest2Src[dstIdx] == null) { + linkDataDest2Src[dstIdx] = new TIntArrayList(); } + linkDataDest2Src[dstIdx].add(srcIdx); } - } catch (SQLException throwables) { - logger.error("SQL error", throwables); - } + }); logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size()); } @@ -271,14 +199,14 @@ public abstract class RankingAlgorithm { return rank.getRanking(resultCount); } - public TIntList pageRankWithPeripheralNodes(int resultCount, boolean includeErrorStates) { + public TIntList pageRankWithPeripheralNodes(int resultCount) { RankVector rank = new RankVector(1.d / domainsById.size()); int iter_max = 100; for (int i = 0; i < iter_max; i++) { if (i == iter_max-1) { - addPeripheralNodes(includeErrorStates); + addPeripheralNodes(); } RankVector newRank = createNewRankVector(rank); @@ -323,7 +251,7 @@ public abstract class RankingAlgorithm { abstract RankVector createNewRankVector(RankVector rank); - public boolean includeInRanking(DomainData data) { + public boolean includeInRanking(RankingDomainData data) { if (data.isAlias()) return false; if (data.isSpecial()) @@ -445,32 +373,4 @@ public abstract class RankingAlgorithm { } } - @Data - @AllArgsConstructor - static class DomainData { - public final int id; - public final String name; - private int alias; - private EdgeDomainIndexingState state; - public final int knownUrls; - public boolean peripheral; - - public int resolveAlias() { - if (alias == 0) return id; - 
return alias; - } - - public boolean isAlias() { - return alias != 0; - } - - public boolean isSpecial() { - return EdgeDomainIndexingState.SPECIAL == state; - } - - public boolean isSocialMedia() { - return EdgeDomainIndexingState.SOCIAL_MEDIA == state; - } - } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java new file mode 100644 index 00000000..c29ed704 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java @@ -0,0 +1,33 @@ +package nu.marginalia.util.ranking; + +import lombok.AllArgsConstructor; +import lombok.Data; +import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; + +@Data +@AllArgsConstructor +class RankingDomainData { + public final int id; + public final String name; + private int alias; + private EdgeDomainIndexingState state; + public final int knownUrls; + public boolean peripheral; + + public int resolveAlias() { + if (alias == 0) return id; + return alias; + } + + public boolean isAlias() { + return alias != 0; + } + + public boolean isSpecial() { + return EdgeDomainIndexingState.SPECIAL == state; + } + + public boolean isSocialMedia() { + return EdgeDomainIndexingState.SOCIAL_MEDIA == state; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java new file mode 100644 index 00000000..79285a83 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java @@ -0,0 +1,105 @@ +package nu.marginalia.util.ranking; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import 
java.sql.SQLException; +import java.util.function.Consumer; +import java.util.function.IntConsumer; + +public class RankingDomainFetcher { + private final HikariDataSource dataSource; + private final EdgeDomainBlacklistImpl blacklist; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final boolean getNames = false; + + @Inject + public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) { + this.dataSource = dataSource; + this.blacklist = blacklist; + } + + public void getDomains(Consumer consumer) { + String query; + if (getNames) { + query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + } + else { + query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + } + + getDomains(query, consumer); + } + + + public void getPeripheralDomains(Consumer consumer) { + String query; + if (getNames) { + query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; + } + else { + query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN 
EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; + } + + getDomains(query, consumer); + } + + private void getDomains(String query, Consumer consumer) { + try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) { + stmt.setFetchSize(10000); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + int id = rsp.getInt(1); + if (!blacklist.isBlacklisted(id)) { + consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false)); + } + } + } + catch (SQLException ex) { + logger.error("Failed to fetch domains", ex); + } + } + + public void eachDomainLink(DomainLinkConsumer consumer) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) + { + stmt.setFetchSize(10000); + + var rsp = stmt.executeQuery(); + + while (rsp.next()) { + int src = rsp.getInt(1); + int dst = rsp.getInt(2); + + consumer.accept(src, dst); + } + } + catch (SQLException ex) { + logger.error("Failed to fetch domain links", ex); + } + } + + public void domainsByPattern(String pattern, IntConsumer idConsumer) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) { + stmt.setString(1, pattern); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + idConsumer.accept(rsp.getInt(1)); + } + } + catch (SQLException ex) { + logger.error("Failed to fetch domains by pattern", ex); + } + } + + public interface DomainLinkConsumer { + void accept(int from, int to); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java index 5660d9a7..f80d307f 
100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java @@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.util.ranking.BuggyStandardPageRank; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.sql.SQLException; import java.util.HashSet; import java.util.Set; @@ -43,12 +44,14 @@ public class UpdateDomainRanksTool { var uploader = new Thread(() -> uploadThread(conn), "Uploader"); logger.info("Ranking"); - var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(),"memex.marginalia.nu"); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu"); rankMax = spr.size()*2; uploader.start(); - spr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> { + spr.pageRankWithPeripheralNodes(rankMax).forEach(i -> { try { uploadQueue.put(i); } catch (InterruptedException e) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java index ec48cd17..f46fb390 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java @@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import 
nu.marginalia.util.ranking.BetterReversePageRank; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.sql.SQLException; import java.util.HashSet; import java.util.Set; @@ -45,7 +46,9 @@ public class UpdateDomainRanksTool2 { logger.info("Ranking"); // "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com", // "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net" - var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); // var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu"); // var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu"); @@ -58,7 +61,7 @@ public class UpdateDomainRanksTool2 { rankMax = rpr.size(); - rpr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> { + rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> { try { uploadQueue.put(i); } catch (InterruptedException e) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java index 53997194..ea1946fc 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java @@ -6,6 +6,7 @@ import com.google.common.hash.Hashing; import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; @@ -73,10 +74,12 @@ public class CrawlJobExtractorPageRankMain { Gson gson = new GsonBuilder().create(); - var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + var rpr = new BetterReversePageRank(domains, "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); rpr.setMaxKnownUrls(750); - var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size(), false); + var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size()); try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) { final var extractor = new CrawlJobExtractorPageRankMain(new DatabaseModule().provideConnection()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java index c87088f6..2f309b07 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java @@ -13,33 +13,14 @@ import java.util.Optional; 
@ImplementedBy(EdgeDataStoreDaoImpl.class) public interface EdgeDataStoreDao { - boolean isBlacklisted(EdgeDomain domain); - EdgeId getDomainId(EdgeDomain domain); List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist backlist, int count); + List getRandomDomains(int count, EdgeDomainBlacklist backlist); List getUrlDetailsMulti(List> ids); - EdgeDomain getDomain(EdgeId id); - Optional> resolveAmbiguousDomain(String name); - - - int getPagesKnown(EdgeId domainId); - int getPagesVisited(EdgeId domainId); - int getPagesIndexed(EdgeId domainId); - - int getIncomingLinks(EdgeId domainId); - int getOutboundLinks(EdgeId domainId); - - double getDomainQuality(EdgeId domainId); - - EdgeDomainIndexingState getDomainState(EdgeId domainId); - - List getLinkingDomains(EdgeId domainId); - - double getRank(EdgeId domainId); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java index 2519a745..430e7603 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java @@ -33,7 +33,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { private final Cache> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build(); private final Cache> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build(); - private static final String DEFAULT_PROTOCOL = "http"; public static double QUALITY_LOWER_BOUND_CUTOFF = -15.; @Inject public EdgeDataStoreDaoImpl(HikariDataSource dataSource) @@ -48,23 +47,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { domainIdCache.invalidateAll(); } - @SneakyThrows - @Override - public boolean isBlacklisted(EdgeDomain domain) { - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT ID 
FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) { - stmt.setString(1, domain.domain); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return true; - } else { - return false; - } - } - } - } - @SneakyThrows @Override public EdgeId getDomainId(EdgeDomain domain) { @@ -108,13 +90,12 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { try (var stmt = connection.prepareStatement( """ - SELECT ID, URL, + SELECT ID, URL, TITLE, DESCRIPTION, - WORDS_TOTAL, FORMAT, FEATURES, + WORDS_TOTAL, FORMAT, FEATURES, IP, DOMAIN_STATE, DATA_HASH FROM EC_URL_VIEW WHERE ID IN """ + idString)) { -// "SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) { stmt.setFetchSize(ids.size()); var rsp = stmt.executeQuery(); @@ -125,7 +106,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { rsp.getString(4), // description -5, // quality rsp.getInt(5), // wordsTotal - rsp.getString(6), // foramt + rsp.getString(6), // format rsp.getInt(7), // features rsp.getString(8), // ip EdgeDomainIndexingState.valueOf(rsp.getString(9)), // domainState @@ -179,9 +160,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); } } } @@ -210,9 +189,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); + 
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); } } } @@ -244,9 +221,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); } } } @@ -262,7 +237,15 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { @Override public List getRandomDomains(int count, EdgeDomainBlacklist blacklist) { - final String q = "SELECT DOMAIN_ID,DOMAIN_NAME FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?"; + final String q = """ + SELECT DOMAIN_ID, DOMAIN_NAME + FROM EC_RANDOM_DOMAINS + INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID + WHERE STATE<2 + AND DOMAIN_ALIAS IS NULL + ORDER BY RAND() + LIMIT ? + """; List domains = new ArrayList<>(count); try (var conn = dataSource.getConnection()) { try (var stmt = conn.prepareStatement(q)) { @@ -273,9 +256,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); } } } @@ -302,223 +283,4 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { } } - - @Override - public Optional> resolveAmbiguousDomain(String name) { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { - stmt.setString(1, name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - try (var stmt = 
connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { - stmt.setString(1, "https://"+name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { - stmt.setString(1, "http://"+name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { - stmt.setString(1, "https://www."+name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { - stmt.setString(1, "http://www."+name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - } catch (SQLException throwables) { - logger.info("Could not resolve domain id for {}", name); - } - - return Optional.empty(); - } - - @SneakyThrows - @Override - public int getPagesKnown(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - @Override - public int getPagesVisited(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } 
catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - - @SneakyThrows - @Override - public int getPagesIndexed(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - @Override - public int getIncomingLinks(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - @SneakyThrows - @Override - public int getOutboundLinks(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - @Override - public double getDomainQuality(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getDouble(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return -5; - } - } - - @Override - public EdgeDomainIndexingState getDomainState(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = 
connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return EdgeDomainIndexingState.fromCode(rsp.getInt(1)); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return EdgeDomainIndexingState.ERROR; - } - - @Override - public List getLinkingDomains(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - List results = new ArrayList<>(25); - try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - results.add(new EdgeDomain(rsp.getString(1))); - } - return results; - } catch (Exception ex) { - logger.error("DB error", ex); - } - - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return Collections.emptyList(); - } - - @Override - public double getRank(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getDouble(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return 1; - } - - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java index c42fcf53..a12b249e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java @@ -10,7 +10,7 @@ import lombok.SneakyThrows; 
import nu.marginalia.util.ranking.BetterReversePageRank; import nu.marginalia.util.ranking.BetterStandardPageRank; import nu.marginalia.util.ranking.BuggyStandardPageRank; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.edge.index.model.RankingSettings; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -18,14 +18,17 @@ import org.slf4j.LoggerFactory; @Singleton public class SearchIndexDao { private final HikariDataSource dataSource; + private RankingDomainFetcher rankingDomains; private final RankingSettings rankingSettings; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject public SearchIndexDao(HikariDataSource dataSource, + RankingDomainFetcher rankingDomains, RankingSettings rankingSettings) { this.dataSource = dataSource; + this.rankingDomains = rankingDomains; this.rankingSettings = rankingSettings; logger.info("SearchIndexDao ranking settings = {}", rankingSettings); } @@ -63,36 +66,36 @@ public class SearchIndexDao { @SneakyThrows public TIntList getRetroDomains() { - var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2, false); + var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new)); + return spr.pageRankWithPeripheralNodes(spr.size()/2); } @SneakyThrows public TIntList getSmallWebDomains() { - var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new)); + var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new)); rpr.setMaxKnownUrls(750); - return rpr.pageRankWithPeripheralNodes(rpr.size(), false); + return rpr.pageRankWithPeripheralNodes(rpr.size()); } @SneakyThrows public TIntList getAcademiaDomains() { - var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), 
rankingSettings.academia.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2, false); + var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new)); + return spr.pageRankWithPeripheralNodes(spr.size()/2); } @SneakyThrows public TIntList getStandardDomains() { - var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2, false); + var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new)); + return spr.pageRankWithPeripheralNodes(spr.size()/2); } @SneakyThrows public TIntList getSpecialDomains() { TIntArrayList results = new TIntArrayList(); try (var connection = dataSource.getConnection(); - var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE=2") + var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'") ) { var rs = stmt.executeQuery(); while (rs.next()) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java index cb778947..8daf790a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java @@ -54,8 +54,11 @@ public class EdgeDomain implements WideHashable { } } } + } - + public EdgeUrl toRootUrl() { + // Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http + return new EdgeUrl("http", this, null, "/"); } public String toString() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java index 54179d64..1d3fd2b2 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java @@ -1,24 +1,43 @@ package nu.marginalia.wmsa.edge.search.siteinfo; +import com.zaxxer.hikari.HikariDataSource; +import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; +import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import nu.marginalia.wmsa.edge.search.model.DomainInformation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.inject.Inject; import javax.inject.Singleton; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Optional; +/* + TODO: This class needs to be refactored, a lot of + these SQL queries are redundant and can be + collapsed into one single query that fetches + all the information + */ @Singleton public class DomainInformationService { - private EdgeDataStoreDao dataStore; + private EdgeDataStoreDaoImpl dataStoreDao; + private HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public DomainInformationService(EdgeDataStoreDao dataStore) { - this.dataStore = dataStore; + public DomainInformationService( + EdgeDataStoreDaoImpl dataStoreDao, + HikariDataSource dataSource) { + this.dataStoreDao = dataStoreDao; + this.dataSource = dataSource; } @@ -28,29 +47,29 @@ public class DomainInformationService { if (domainId == null) { return Optional.empty(); } - EdgeDomain domain = dataStore.getDomain(domainId); + EdgeDomain domain = dataStoreDao.getDomain(domainId); - boolean blacklisted = dataStore.isBlacklisted(domain); - int pagesKnown = dataStore.getPagesKnown(domainId); - int pagesVisited = 
dataStore.getPagesVisited(domainId); - int pagesIndexed = dataStore.getPagesIndexed(domainId); - int incomingLinks = dataStore.getIncomingLinks(domainId); - int outboundLinks = dataStore.getOutboundLinks(domainId); - double rank = Math.round(10000.0*(1.0-dataStore.getRank(domainId)))/100; - EdgeDomainIndexingState state = dataStore.getDomainState(domainId); - double nominalQuality = Math.round(100*100*Math.exp(dataStore.getDomainQuality(domainId)))/100.; - List linkingDomains = dataStore.getLinkingDomains(domainId); + boolean blacklisted = isBlacklisted(domain); + int pagesKnown = getPagesKnown(domainId); + int pagesVisited = getPagesVisited(domainId); + int pagesIndexed = getPagesIndexed(domainId); + int incomingLinks = getIncomingLinks(domainId); + int outboundLinks = getOutboundLinks(domainId); + double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100; + EdgeDomainIndexingState state = getDomainState(domainId); + double nominalQuality = Math.round(100*100*Math.exp(getDomainQuality(domainId)))/100.; + List linkingDomains = getLinkingDomains(domainId); return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, nominalQuality, rank, state, linkingDomains)); } private EdgeId getDomainFromPartial(String site) { try { - return dataStore.getDomainId(new EdgeDomain(site)); + return dataStoreDao.getDomainId(new EdgeDomain(site)); } catch (Exception ex) { try { - return dataStore.getDomainId(new EdgeDomain(site)); + return dataStoreDao.getDomainId(new EdgeDomain(site)); } catch (Exception ex2) { return null; @@ -58,4 +77,178 @@ public class DomainInformationService { } } + + @SneakyThrows + public boolean isBlacklisted(EdgeDomain domain) { + + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) { + stmt.setString(1, domain.domain); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return 
true; + } else { + return false; + } + } + } + } + + @SneakyThrows + public int getPagesKnown(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + @SneakyThrows + public int getPagesVisited(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + + @SneakyThrows + public int getPagesIndexed(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + @SneakyThrows + public int getIncomingLinks(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + @SneakyThrows + public int getOutboundLinks(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) { + 
stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + @SneakyThrows + public double getDomainQuality(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getDouble(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return -5; + } + } + + public EdgeDomainIndexingState getDomainState(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return EdgeDomainIndexingState.fromCode(rsp.getInt(1)); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + return EdgeDomainIndexingState.ERROR; + } + + public List getLinkingDomains(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + List results = new ArrayList<>(25); + try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? 
ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + results.add(new EdgeDomain(rsp.getString(1))); + } + return results; + } catch (Exception ex) { + logger.error("DB error", ex); + } + + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + return Collections.emptyList(); + } + + public double getRank(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getDouble(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + return 1; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java index bb946238..1251f626 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java @@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.tools; import com.google.inject.Inject; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; @@ -59,7 +60,9 @@ public class IndexMergerMain { } var hikari = new DatabaseModule().provideConnection(); - var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, new RankingSettings())); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + var partitioner = 
new SearchIndexPartitioner(new SearchIndexDao(hikari, domains, new RankingSettings())); var blacklist = new EdgeDomainBlacklistImpl(hikari); new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist); From 2faaed3393d5f2cb5f841ba49a87726922f9a44e Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 8 Jun 2022 16:52:33 +0200 Subject: [PATCH 17/27] Fixed conversion bug SQL->EdgeDomainIndexingState --- .../model/crawl/EdgeDomainIndexingState.java | 31 +++++-------------- .../siteinfo/DomainInformationService.java | 2 +- 2 files changed, 9 insertions(+), 24 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java index 119da59d..b10d0e88 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java @@ -1,27 +1,12 @@ package nu.marginalia.wmsa.edge.model.crawl; public enum EdgeDomainIndexingState { - ACTIVE(0), - EXHAUSTED(1), - SPECIAL(2), - SOCIAL_MEDIA(3), - BLOCKED(-1), - REDIR(-2), - ERROR(-3), - UNKNOWN(-100); - - public final int code; - - EdgeDomainIndexingState(int code) { - this.code = code; - } - - public static EdgeDomainIndexingState fromCode(int code) { - for (var state : values()) { - if (state.code == code) { - return state; - } - } - return UNKNOWN; - } + ACTIVE, + EXHAUSTED, + SPECIAL, + SOCIAL_MEDIA, + BLOCKED, + REDIR, + ERROR, + UNKNOWN } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java index 1d3fd2b2..496fe57b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java @@ -203,7 +203,7 @@ public class DomainInformationService { stmt.setInt(1, domainId.getId()); var rsp = stmt.executeQuery(); if (rsp.next()) { - return EdgeDomainIndexingState.fromCode(rsp.getInt(1)); + return EdgeDomainIndexingState.valueOf(rsp.getString(1)); } } catch (Exception ex) { logger.error("DB error", ex); From 495e6a1639dc070dfe8596caf36442dc9e2c8507 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 8 Jun 2022 16:52:46 +0200 Subject: [PATCH 18/27] Use 64 bit path hash for EC_URL --- .../wmsa/edge/converting/loader/SqlLoadUrls.java | 10 ++++++++-- .../src/main/resources/sql/edge-crawler-cache.sql | 3 +-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java index fd698c82..ba9ae43a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java @@ -1,11 +1,13 @@ package nu.marginalia.wmsa.edge.converting.loader; +import com.google.common.hash.Hashing; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.nio.charset.StandardCharsets; import java.sql.SQLException; import java.sql.Types; @@ -28,7 +30,7 @@ public class SqlLoadUrls { IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, IN PORT INT, IN PATH VARCHAR(255), - IN PATH_HASH INT + IN PATH_HASH BIGINT ) BEGIN INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN; @@ -59,7 +61,7 @@ public class SqlLoadUrls { insertCall.setNull(3, Types.INTEGER); } 
insertCall.setString(4, url.path); - insertCall.setInt(5, url.path.hashCode()); + insertCall.setLong(5, hashPath(url.path)); insertCall.addBatch(); } var ret = insertCall.executeBatch(); @@ -91,4 +93,8 @@ public class SqlLoadUrls { logger.warn("SQL error inserting URLs", ex); } } + + private long hashPath(String path) { + return Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8).asLong(); + } } diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index 6c99eccf..2e517ac9 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -50,7 +50,7 @@ CREATE TABLE IF NOT EXISTS EC_URL ( PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin, PORT INT, - PATH_HASH INT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain", + PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain", VISITED BOOLEAN NOT NULL DEFAULT FALSE, STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok', @@ -173,7 +173,6 @@ CREATE TABLE IF NOT EXISTS EC_API_KEY ( ); CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED); -CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED); CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP); ---; From 65aee9419dcec7803dc788d21fde2a918a791c11 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 9 Jun 2022 21:25:31 +0200 Subject: [PATCH 19/27] Tidy up --- .../util/ranking/RankingAlgorithm.java | 35 ++++--------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java index 875031f1..4d255087 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java @@ -17,44 +17,21 @@ import java.util.stream.IntStream; import it.unimi.dsi.fastutil.ints.IntArrays; public abstract class RankingAlgorithm { - final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); - final TIntIntHashMap domainIndexToId = new TIntIntHashMap(); - final TIntIntHashMap domainIdToIndex = new TIntIntHashMap(); + protected final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); + protected final TIntIntHashMap domainIndexToId = new TIntIntHashMap(); + protected final TIntIntHashMap domainIdToIndex = new TIntIntHashMap(); - TIntArrayList[] linkDataSrc2Dest; - TIntArrayList[] linkDataDest2Src; + protected TIntArrayList[] linkDataSrc2Dest; + protected TIntArrayList[] linkDataDest2Src; public final Set originDomains = new HashSet<>(); public final Set originDomainIds = new HashSet<>(); private int maxKnownUrls = Integer.MAX_VALUE; - private static final boolean getNames = true; - private final Logger logger = LoggerFactory.getLogger(getClass()); - private RankingDomainFetcher domains; - public static void main(String... args) throws IOException { - var ds = new DatabaseModule().provideConnection(); - var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); - - var rpr = new BuggyReversePageRank(domains, "wiki.xxiivv.com"); - var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu"); - - var rankVector = spr.pageRankVector(); - var norm = rankVector.norm(); - rpr.pageRank(i -> rankVector.get(i) / norm, 25).forEach(i -> { - System.out.println(spr.domainNameFromId(i)); - return true; - }); - } - - public String domainNameFromId(int id) { - return domainsById.get(id).name; - } - public boolean isPeripheral(int id) { - return domainsById.get(id).peripheral; - } + private final RankingDomainFetcher domains; public RankingAlgorithm(RankingDomainFetcher domains, String... 
origins) { this.domains = domains; From 389818c6c36a963a50f268d00bd5de99c6251f5b Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 9 Jun 2022 21:47:59 +0200 Subject: [PATCH 20/27] Make website url configurable for search engine redirects --- .../nu/marginalia/wmsa/configuration/WebsiteUrl.java | 7 +++++++ .../wmsa/edge/search/EdgeSearchModule.java | 2 ++ .../wmsa/edge/search/EdgeSearchService.java | 12 +++++++----- 3 files changed, 16 insertions(+), 5 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WebsiteUrl.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WebsiteUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WebsiteUrl.java new file mode 100644 index 00000000..8e3f8c4c --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WebsiteUrl.java @@ -0,0 +1,7 @@ +package nu.marginalia.wmsa.configuration; + +public record WebsiteUrl(String url) { + public String withPath(String path) { + return url + path; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchModule.java index b7290d70..9db18272 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchModule.java @@ -2,12 +2,14 @@ package nu.marginalia.wmsa.edge.search; import com.google.inject.AbstractModule; import nu.marginalia.util.language.conf.LanguageModels; +import nu.marginalia.wmsa.configuration.WebsiteUrl; import nu.marginalia.wmsa.configuration.WmsaHome; public class EdgeSearchModule extends AbstractModule { public void configure() { bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); + bind(WebsiteUrl.class).toInstance(new WebsiteUrl(System.getProperty("website-url", "https://search.marginalia.nu/"))); } } diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java index 329322a2..fa2d06e0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java @@ -8,6 +8,7 @@ import com.google.inject.name.Named; import lombok.SneakyThrows; import nu.marginalia.wmsa.api.model.ApiSearchResult; import nu.marginalia.wmsa.api.model.ApiSearchResults; +import nu.marginalia.wmsa.configuration.WebsiteUrl; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; @@ -34,7 +35,7 @@ public class EdgeSearchService extends Service { private final EdgeIndexClient indexClient; private final EdgeSearchOperator searchOperator; private final CommandEvaluator searchCommandEvaulator; - + private final WebsiteUrl websiteUrl; private static final Logger logger = LoggerFactory.getLogger(EdgeSearchService.class); @SneakyThrows @@ -45,13 +46,14 @@ public class EdgeSearchService extends Service { Initialization initialization, MetricsServer metricsServer, EdgeSearchOperator searchOperator, - CommandEvaluator searchCommandEvaulator - ) { + CommandEvaluator searchCommandEvaulator, + WebsiteUrl websiteUrl) { super(ip, port, initialization, metricsServer); this.indexClient = indexClient; this.searchOperator = searchOperator; this.searchCommandEvaulator = searchCommandEvaulator; + this.websiteUrl = websiteUrl; Spark.staticFiles.expireTime(600); @@ -79,7 +81,7 @@ public class EdgeSearchService extends Service { final String query = URLEncoder.encode(String.format("%s site:%s", queryRaw, site), StandardCharsets.UTF_8); final String profile = request.queryParamOrDefault("profile", "yolo"); - 
response.redirect("https://search.marginalia.nu/search?query="+query+"&profile="+profile); + response.redirect(websiteUrl.withPath("search?query="+query+"&profile="+profile)); return null; } @@ -141,7 +143,7 @@ public class EdgeSearchService extends Service { final String queryParam = request.queryParams("query"); if (null == queryParam || queryParam.isBlank()) { - response.redirect("https://search.marginalia.nu/"); + response.redirect(websiteUrl.url()); return null; } From 3e64003252e7739f0bccf48cbdf166c530ee1f53 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 9 Jun 2022 22:19:29 +0200 Subject: [PATCH 21/27] Re-add quality property to URLs --- .../loader/SqlLoadProcessedDocument.java | 9 ++++++--- .../edge/data/dao/EdgeDataStoreDaoImpl.java | 18 ++++++++++-------- .../main/resources/sql/edge-crawler-cache.sql | 9 +++++++-- .../loader/SqlLoadProcessedDocumentTest.java | 3 ++- 4 files changed, 25 insertions(+), 14 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java index b033e6ea..85b6c3fe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java @@ -33,10 +33,11 @@ public class SqlLoadProcessedDocument { IN LENGTH INT, IN FEATURES INT, IN STANDARD VARCHAR(32), + IN QUALITY DOUBLE, IN HASH INT) BEGIN SET FOREIGN_KEY_CHECKS=0; - REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH); + REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY); UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; SET 
FOREIGN_KEY_CHECKS=1; END @@ -47,6 +48,7 @@ public class SqlLoadProcessedDocument { IN STATE VARCHAR(32)) BEGIN UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; + DELETE FROM PAGE_DATA WHERE ID=URL_ID; END """); @@ -59,7 +61,7 @@ public class SqlLoadProcessedDocument { public void load(LoaderData data, List documents) { try (var conn = dataSource.getConnection(); - var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?)")) { + var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) { conn.setAutoCommit(false); for (var doc : documents) { @@ -76,7 +78,8 @@ public class SqlLoadProcessedDocument { stmt.setInt(5, doc.length()); stmt.setInt(6, doc.htmlFeatures()); stmt.setString(7, doc.standard().name()); - stmt.setInt(8, (int) doc.hash()); + stmt.setDouble(8, doc.quality()); + stmt.setInt(9, (int) doc.hash()); stmt.addBatch(); } var ret = stmt.executeBatch(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java index 430e7603..233ffd3a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java @@ -92,8 +92,10 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { """ SELECT ID, URL, TITLE, DESCRIPTION, + QUALITY, WORDS_TOTAL, FORMAT, FEATURES, - IP, DOMAIN_STATE, DATA_HASH + IP, DOMAIN_STATE, + DATA_HASH FROM EC_URL_VIEW WHERE ID IN """ + idString)) { stmt.setFetchSize(ids.size()); @@ -104,13 +106,13 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { var val = new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(3), // title rsp.getString(4), // description - -5, // quality - rsp.getInt(5), // wordsTotal - rsp.getString(6), // format - rsp.getInt(7), // features - rsp.getString(8), // ip - 
EdgeDomainIndexingState.valueOf(rsp.getString(9)), // domainState - rsp.getInt(10), // dataHash + rsp.getDouble(5), // quality + rsp.getInt(6), // wordsTotal + rsp.getString(7), // format + rsp.getInt(8), // features + rsp.getString(9), // ip + EdgeDomainIndexingState.valueOf(rsp.getString(10)), // domainState + rsp.getInt(11), // dataHash EdgePageScoreAdjustment.zero(), // urlQualityAdjustment Integer.MAX_VALUE, // rankingId Double.MAX_VALUE, // termScore diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index 2e517ac9..36ab040a 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -72,6 +72,7 @@ CREATE TABLE IF NOT EXISTS EC_PAGE_DATA ( FEATURES INT COMMENT "Bit-encoded feature set of document, @see HtmlFeature" NOT NULL, DATA_HASH INTEGER NOT NULL, + QUALITY DOUBLE NOT NULL, FOREIGN KEY (ID) REFERENCES EC_URL(ID) ON DELETE CASCADE ) @@ -122,15 +123,19 @@ CREATE OR REPLACE VIEW EC_URL_VIEW AS EC_DOMAIN.DOMAIN_TOP AS DOMAIN_TOP, EC_URL.ID AS ID, EC_DOMAIN.ID AS DOMAIN_ID, + EC_URL.VISITED AS VISITED, + + EC_PAGE_DATA.QUALITY AS QUALITY, EC_PAGE_DATA.DATA_HASH AS DATA_HASH, EC_PAGE_DATA.TITLE AS TITLE, EC_PAGE_DATA.DESCRIPTION AS DESCRIPTION, - EC_DOMAIN.IP AS IP, - EC_DOMAIN.STATE AS STATE, EC_PAGE_DATA.WORDS_TOTAL AS WORDS_TOTAL, EC_PAGE_DATA.FORMAT AS FORMAT, EC_PAGE_DATA.FEATURES AS FEATURES, + + EC_DOMAIN.IP AS IP, + EC_DOMAIN.STATE AS STATE, EC_DOMAIN.RANK AS RANK, EC_DOMAIN.STATE AS DOMAIN_STATE FROM EC_URL diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java index e81e44e3..ecb0e88a 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java +++ 
b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java @@ -75,7 +75,7 @@ class SqlLoadProcessedDocumentTest { EdgeHtmlStandard.HTML5, 100, 12345, - -5 + -3.14 ))); var details = dataStoreDao.getUrlDetailsMulti(List.of(new EdgeId<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/"))))); @@ -88,6 +88,7 @@ class SqlLoadProcessedDocumentTest { assertTrue(urlDetails.isAffiliate()); assertEquals(100, urlDetails.words); assertEquals(12345, urlDetails.dataHash); + assertEquals(-3.14, urlDetails.getUrlQuality()); } } \ No newline at end of file From 1de63f225d9d425ee89741e5a3fa1b00893c5c5b Mon Sep 17 00:00:00 2001 From: vlofgren Date: Tue, 14 Jun 2022 17:55:14 +0200 Subject: [PATCH 22/27] Added support for -style tags. --- .../processor/DocumentProcessor.java | 17 +++++----- .../processor/logic/LinkParser.java | 34 ++++++++++++++++--- .../crawling/retreival/CrawlerRetreiver.java | 14 ++++---- .../wmsa/edge/model/EdgeDomain.java | 1 + .../marginalia/wmsa/edge/model/EdgeUrl.java | 9 ++--- .../wmsa/edge/crawling/LinkParserTest.java | 34 +++++++++++++++++-- 6 files changed, 81 insertions(+), 28 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index ce6393f2..b205cdea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -185,26 +185,25 @@ public class DocumentProcessor { } private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) { - var links = doc.getElementsByTag("a"); - var frames = doc.getElementsByTag("frame"); - var feeds = doc.select("link[rel=alternate]"); - LinkProcessor lp = new LinkProcessor(ret, baseUrl); + final LinkProcessor lp = 
new LinkProcessor(ret, baseUrl); - for (var atag : links) { + baseUrl = linkParser.getBaseLink(doc, baseUrl); + + for (var atag : doc.getElementsByTag("a")) { linkParser.parseLink(baseUrl, atag).ifPresent(lp::accept); } - for (var frame : frames) { + for (var frame : doc.getElementsByTag("frame")) { linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept); } - for (var link : feeds) { + for (var link : doc.select("link[rel=alternate]")) { feedExtractor - .getFeedFromAlternateTag(baseUrl, link) + .getFeedFromAlternateTag(baseUrl, link) .ifPresent(lp::acceptFeed); } - Set linkTerms = new HashSet<>(); + final Set linkTerms = new HashSet<>(); for (var domain : lp.getForeignDomains()) { linkTerms.add("links:"+domain.toString().toLowerCase()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index aedaf0f7..378182f2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -1,9 +1,12 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; import com.google.common.base.CharMatcher; +import com.google.common.base.Strings; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.jetbrains.annotations.Contract; +import org.jetbrains.annotations.Nullable; +import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,11 +29,11 @@ public class LinkParser { ".gz", ".asc", ".md5", ".asf", ".mov", ".sig", ".pub", ".iso"); @Contract(pure=true) - public Optional parseLink(EdgeUrl baseUrl, Element l) { + public Optional parseLink(EdgeUrl relativeBaseUrl, Element l) { return Optional.of(l) .filter(this::shouldIndexLink) .map(this::getUrl) - .map(link -> resolveUrl(baseUrl, 
link)) + .map(link -> resolveUrl(relativeBaseUrl, link)) .flatMap(this::createURI) .map(URI::normalize) .map(this::renormalize) @@ -100,6 +103,8 @@ public class LinkParser { } private static final Pattern paramRegex = Pattern.compile("\\?.*$"); + private static final Pattern spaceRegex = Pattern.compile(" "); + @SneakyThrows private String resolveUrl(EdgeUrl baseUrl, String s) { s = paramRegex.matcher(s).replaceAll(""); @@ -111,10 +116,12 @@ public class LinkParser { // url looks like /my-page if (s.startsWith("/")) { - return baseUrl.sibling(s).toString(); + return baseUrl.withPath(s).toString(); } - return baseUrl.sibling(relativeNavigation(baseUrl) + s.replaceAll(" ", "%20")).toString(); + final String partFromNewLink = spaceRegex.matcher(s).replaceAll("%20"); + + return baseUrl.withPath(relativeNavigation(baseUrl) + partFromNewLink).toString(); } // for a relative url that looks like /foo or /foo/bar; return / or /foo @@ -162,4 +169,23 @@ public class LinkParser { } return true; } + + @Nullable + public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) { + var baseTags = parsed.getElementsByTag("base"); + + try { + for (var tag : baseTags) { + String href = tag.attr("href"); + if (!Strings.isNullOrEmpty(href)) { + return new EdgeUrl(resolveUrl(documentUrl, href)); + } + } + } + catch (Exception ex) { + logger.warn("Failed to parse , falling back to document url"); + } + + return documentUrl; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index a7c08a24..2b27ed4d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -202,10 +202,11 @@ public class CrawlerRetreiver { return domain.equals(url.domain.toString().toLowerCase()); } - private void 
findLinks(EdgeUrl url, Document parsed) { + private void findLinks(EdgeUrl baseUrl, Document parsed) { + baseUrl = linkParser.getBaseLink(parsed, baseUrl); for (var link : parsed.getElementsByTag("a")) { - linkParser.parseLink(url, link) + linkParser.parseLink(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) .filter(u -> !urlBlocklist.isForumLink(u)) @@ -213,7 +214,7 @@ public class CrawlerRetreiver { .ifPresent(queue::addLast); } for (var link : parsed.getElementsByTag("frame")) { - linkParser.parseFrame(url, link) + linkParser.parseFrame(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) .filter(u -> !urlBlocklist.isForumLink(u)) @@ -221,7 +222,7 @@ public class CrawlerRetreiver { .ifPresent(queue::addLast); } for (var link : parsed.getElementsByTag("iframe")) { - linkParser.parseFrame(url, link) + linkParser.parseFrame(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) .filter(u -> !urlBlocklist.isForumLink(u)) @@ -230,10 +231,11 @@ public class CrawlerRetreiver { } } - private Optional findCanonicalUrl(EdgeUrl url, Document parsed) { + private Optional findCanonicalUrl(EdgeUrl baseUrl, Document parsed) { + baseUrl = baseUrl.withPath("/"); for (var link : parsed.select("link[rel=canonical]")) { - return linkParser.parseLink(url, link); + return linkParser.parseLink(baseUrl, link); } return Optional.empty(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java index cb778947..53740c95 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java @@ -21,6 +21,7 @@ public class EdgeDomain implements WideHashable { @SneakyThrows public EdgeDomain(String host) { + Objects.requireNonNull(host, "domain name must not be null"); var dot = 
host.lastIndexOf('.'); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java index 39bc475b..e82d4b7c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java @@ -79,11 +79,6 @@ public class EdgeUrl implements WideHashable { this.port = port(URI.getPort(), proto); } - public EdgeUrl sibling(String newPath) { - return new EdgeUrl(proto, domain, port, newPath); - } - - private static Integer port(Integer port, String protocol) { if (null == port || port < 1) { return null; @@ -120,5 +115,7 @@ public class EdgeUrl implements WideHashable { return (int) path.chars().filter(c -> c=='/').count(); } - + public EdgeUrl withPath(String s) { + return new EdgeUrl(proto, domain, port, s); + } } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java index 80c62153..d4a7e428 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java @@ -11,9 +11,8 @@ import static org.junit.jupiter.api.Assertions.*; class LinkParserTest { - private String parseLink(String href, String base) throws URISyntaxException { - var url = new EdgeUrl("http://www.marginalia.nu/" + base); - var domain = url.domain; + private String parseLink(String href, String relBase) throws URISyntaxException { + var url = new EdgeUrl("http://www.marginalia.nu/" + relBase); var parser = new LinkParser(); var stuff = Jsoup.parseBodyFragment("test"); var lnk = parser.parseLink( @@ -43,6 +42,7 @@ class LinkParserTest { void testAnchor() throws URISyntaxException { assertNull(parseLink("#test", "/")); } + @Test void testRelative() throws URISyntaxException { 
assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/")); @@ -51,4 +51,32 @@ class LinkParserTest { assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/foo/index.html")); assertEquals("http://www.marginalia.nu/test", parseLink("/test", "/foo/index.html")); } + + private EdgeUrl getBaseUrl(String href, EdgeUrl documentUrl) { + LinkParser lp = new LinkParser(); + + return lp.getBaseLink(Jsoup.parse(""), documentUrl); + } + + @Test + public void getBaseUrlTest() throws URISyntaxException { + assertEquals(new EdgeUrl("https://www.marginalia.nu/base"), + getBaseUrl("/base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar"))); + + assertEquals(new EdgeUrl("https://memex.marginalia.nu/base"), + getBaseUrl("https://memex.marginalia.nu/base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar"))); + + assertEquals(new EdgeUrl("https://www.marginalia.nu/test/base"), + getBaseUrl("base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar"))); + } + + @Test + public void testParseBadBaseLink() throws URISyntaxException { + LinkParser lp = new LinkParser(); + var url = new EdgeUrl("https://memex.marginalia.nu/"); + + assertEquals(url, lp.getBaseLink(Jsoup.parse(""), url)); + assertEquals(url, lp.getBaseLink(Jsoup.parse(""), url)); + assertEquals(url, lp.getBaseLink(Jsoup.parse(""), url)); + } } \ No newline at end of file From 8ba80931a9d909318ef5286d9353edb9cfc5e661 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 15 Jun 2022 12:59:56 +0200 Subject: [PATCH 23/27] Restructuring index code: Move dictionary --- .../nu/marginalia/wmsa/edge/index/IndexServicesFactory.java | 4 ++-- .../edge/index/{service => }/dictionary/DictionaryReader.java | 2 +- .../edge/index/{service => }/dictionary/DictionaryWriter.java | 2 +- .../edge/index/{service => }/dictionary/TokenCompressor.java | 2 +- .../nu/marginalia/wmsa/edge/index/service/SearchIndexes.java | 2 +- .../wmsa/edge/index/service/index/SearchIndexWriterImpl.java | 2 +- 
.../wmsa/edge/index/service/DictionaryWriterTest.java | 4 ++-- .../wmsa/edge/index/service/SearchIndexWriterTest.java | 2 +- .../wmsa/edge/index/service/TokenCompressorTest.java | 2 +- 9 files changed, 11 insertions(+), 11 deletions(-) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => }/dictionary/DictionaryReader.java (92%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => }/dictionary/DictionaryWriter.java (99%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => }/dictionary/TokenCompressor.java (97%) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java index fb58ac0e..4d0c18e9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java @@ -7,8 +7,8 @@ import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; import nu.marginalia.wmsa.edge.index.service.index.*; import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import org.slf4j.Logger; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryReader.java rename to 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java index 90d270d2..ca10c000 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.dictionary; +package nu.marginalia.wmsa.edge.index.dictionary; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryWriter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java index 9ce1b149..906231be 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.dictionary; +package nu.marginalia.wmsa.edge.index.dictionary; import com.google.inject.Inject; import com.google.inject.Singleton; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/TokenCompressor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/TokenCompressor.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java index 9f26fffd..5a3d73ab 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/TokenCompressor.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.dictionary; +package nu.marginalia.wmsa.edge.index.dictionary; import nu.marginalia.util.ByteFolder; import nu.marginalia.util.dict.DictionaryHashMap; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java index 91065101..dea842c6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java @@ -5,7 +5,7 @@ import com.google.inject.Singleton; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import org.slf4j.Logger; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java index 2f482815..d434042d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java @@ -4,7 +4,7 @@ import io.reactivex.rxjava3.disposables.Disposable; import io.reactivex.rxjava3.schedulers.Schedulers; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; +import 
nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.EdgeUrl; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java index 180576fc..b6e61aa2 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java @@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.index.service; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter; import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import org.junit.jupiter.api.Disabled; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java index 5f1d2a0f..6b219bad 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java @@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.index.service; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; +import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; import 
nu.marginalia.wmsa.edge.index.service.index.SearchIndex; import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter; import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java index ee84472e..e780ed62 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java @@ -1,6 +1,6 @@ package nu.marginalia.wmsa.edge.index.service; -import nu.marginalia.wmsa.edge.index.service.dictionary.TokenCompressor; +import nu.marginalia.wmsa.edge.index.dictionary.TokenCompressor; import org.junit.jupiter.api.Test; import java.util.Arrays; From 88908c203db0913865facb5c7fc873d533cb4c82 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 15 Jun 2022 16:34:03 +0200 Subject: [PATCH 24/27] Refactoring conversion --- .../wmsa/edge/EdgeSearchE2ETest.java | 23 +++- marginalia_nu/src/e2e/resources/init.sh | 2 +- .../nu/marginalia/util/btree/BTreeWriter.java | 16 ++- .../marginalia/util/btree/WriteCallback.java | 4 +- .../util/btree/model/BTreeHeader.java | 3 +- .../marginalia/util/hash/LongPairHashMap.java | 47 ++++---- .../util/multimap/MultimapFileLong.java | 11 +- .../multimap/MultimapFileLongOffsetSlice.java | 70 ++++++++++++ .../util/multimap/MultimapFileLongSlice.java | 29 +++++ .../util/multimap/MultimapSearcher.java | 4 +- .../util/multimap/MultimapSorter.java | 4 +- .../loader/SqlLoadProcessedDocument.java | 2 +- .../edge/data/dao/EdgeDataStoreDaoImpl.java | 12 +- .../index/{radix => }/EdgeIndexBucket.java | 10 +- .../wmsa/edge/index/EdgeIndexControl.java | 9 +- .../wmsa/edge/index/EdgeIndexService.java | 6 +- .../wmsa/edge/index/IndexServicesFactory.java | 12 +- .../ConversionUnnecessaryException.java | 2 +- 
.../SearchEngineRanking.java | 2 +- .../SearchIndexConverter.java | 108 +++++++----------- .../SearchIndexDao.java | 2 +- .../SearchIndexPartitioner.java | 4 +- .../SearchIndexPreconverter.java | 3 +- .../words/WordIndexLengthsTable.java | 10 ++ .../words/WordIndexOffsetsTable.java | 67 +++++++++++ .../conversion/words/WordIndexTables.java | 56 +++++++++ .../conversion/words/WordsTableWriter.java | 75 ++++++++++++ .../index => journal}/SearchIndexWriter.java | 2 +- .../SearchIndexWriterImpl.java | 2 +- .../IndexWordsTable.java} | 90 +++++++++------ .../index => reader}/SearchIndex.java | 6 +- .../index => reader}/SearchIndexReader.java | 10 +- .../{service => reader}/SearchIndexes.java | 8 +- .../query/IndexQueryBuilder.java | 4 +- .../query/IndexSearchBudget.java | 2 +- .../{service => reader}/query/Query.java | 2 +- .../wmsa/edge/index/service/SearchOrder.java | 6 - .../index/wordstable/IndexWordsTable.java | 48 -------- .../index/wordstable/WordsTableWriter.java | 85 -------------- .../model/search/EdgeSearchSpecification.java | 4 +- .../wmsa/edge/search/EdgeSearchOperator.java | 3 +- .../wmsa/edge/search/EdgeSearchProfile.java | 17 ++- .../command/commands/SiteSearchCommand.java | 4 +- .../edge/search/model/DomainInformation.java | 1 - .../wmsa/edge/search/query/QueryFactory.java | 1 - .../siteinfo/DomainInformationService.java | 3 +- .../wmsa/edge/tools/IndexMergerMain.java | 4 +- .../templates/edge/site-info-gmi.hdb | 1 - .../resources/templates/edge/site-info.hdb | 1 - .../util/btree/BTreeWriterTest.java | 26 ++--- .../util/hash/LongPairHashMapTest.java | 4 +- .../index/service/DictionaryWriterTest.java | 4 +- .../index/service/EdgeIndexClientTest.java | 6 +- .../index/service/SearchIndexWriterTest.java | 12 +- .../edge/search/query/QueryVariantsTest.java | 5 +- 55 files changed, 574 insertions(+), 380 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java create mode 100644 
marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{radix => }/EdgeIndexBucket.java (93%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => conversion}/ConversionUnnecessaryException.java (80%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => conversion}/SearchEngineRanking.java (97%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => conversion}/SearchIndexConverter.java (75%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => conversion}/SearchIndexDao.java (98%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/query => conversion}/SearchIndexPartitioner.java (96%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => conversion}/SearchIndexPreconverter.java (97%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => journal}/SearchIndexWriter.java (88%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => journal}/SearchIndexWriterImpl.java (98%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index/wordstable/BtreeWordsTable.java => reader/IndexWordsTable.java} (58%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => reader}/SearchIndex.java (93%) rename 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => reader}/SearchIndexReader.java (96%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => reader}/SearchIndexes.java (93%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => reader}/query/IndexQueryBuilder.java (97%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => reader}/query/IndexSearchBudget.java (87%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => reader}/query/Query.java (73%) delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java index af43e462..08408de2 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java @@ -28,6 +28,7 @@ import java.util.ArrayList; import java.util.List; import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*; +import static org.junit.jupiter.api.Assertions.assertEquals; @Tag("e2e") @Testcontainers @@ -156,6 +157,16 @@ public class EdgeSearchE2ETest extends E2ETestBase { return wikipediaFiles.toString(); } + private List getTitlesFromSearchResults(String html) { + List ret = new ArrayList<>(); + + for (var title : Jsoup.parse(html).select(".card.search-result > h2")) { + ret.add(title.text()); + } + + return ret; + } + @Test public void testFrontPage() throws IOException { var driver = chrome.getWebDriver(); @@ -173,8 +184,9 @@ public class EdgeSearchE2ETest extends E2ETestBase { 
driver.get("http://proxyNginx/search?query=bird&profile=corpo"); System.out.println(driver.getTitle()); - System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); + var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); + assertEquals(List.of("Bird"), getTitlesFromSearchResults(html)); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query")); } @@ -187,20 +199,23 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-info")); } + @Test public void testSiteSearch() throws IOException { var driver = chrome.getWebDriver(); driver.get("http://proxyNginx/search?query=site:wikipedia.local%20frog"); System.out.println(driver.getTitle()); - System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); + var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); + + assertEquals(List.of("Frog", "Binomial nomenclature", "Amphibian", "Mantis"), getTitlesFromSearchResults(html)); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search")); } + @Test public void testBrowse() throws IOException { var driver = chrome.getWebDriver(); @@ -209,7 +224,6 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse")); } @Test @@ -220,7 +234,6 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), 
screenshotFilename("define")); } @Test diff --git a/marginalia_nu/src/e2e/resources/init.sh b/marginalia_nu/src/e2e/resources/init.sh index 5409f787..50dbd406 100644 --- a/marginalia_nu/src/e2e/resources/init.sh +++ b/marginalia_nu/src/e2e/resources/init.sh @@ -69,4 +69,4 @@ memex memex dating dating EOF -WMSA_HOME=${HOME} java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1 \ No newline at end of file +WMSA_HOME=${HOME} java -server -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1 \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java index 28ac4914..b43faca7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java @@ -3,6 +3,7 @@ package nu.marginalia.util.btree; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; +import nu.marginalia.util.multimap.MultimapFileLongSlice; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -12,9 +13,9 @@ import java.io.IOException; public class BTreeWriter { private final Logger logger = LoggerFactory.getLogger(BTreeWriter.class); private final BTreeContext ctx; - private final MultimapFileLong map; + private final MultimapFileLongSlice map; - public BTreeWriter(MultimapFileLong map, BTreeContext ctx) { + public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) { this.map = map; this.ctx = ctx; } @@ -31,13 +32,18 @@ public class BTreeWriter { return size; } - public long write(long offset, int numEntries, WriteCallback writeIndex) + /** Construct a BTree with numEntries entries at offset in the associated map + * + * @return The size of the written data + */ + public long write(long offset, int numEntries, WriteCallback writeIndexCallback) throws 
IOException { - var header = makeHeader(offset, numEntries); + BTreeHeader header = makeHeader(offset, numEntries); header.write(map, offset); - writeIndex.write(header.dataOffsetLongs()); + + writeIndexCallback.write(map.atOffset(header.dataOffsetLongs())); if (header.layers() < 1) { return ctx.calculateSize(numEntries); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java index 70bd8132..a6225db1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java @@ -1,7 +1,9 @@ package nu.marginalia.util.btree; +import nu.marginalia.util.multimap.MultimapFileLongSlice; + import java.io.IOException; public interface WriteCallback { - void write(long offset) throws IOException; + void write(MultimapFileLongSlice slice) throws IOException; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java index 4951f5b8..8d68b424 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java @@ -1,6 +1,7 @@ package nu.marginalia.util.btree.model; import nu.marginalia.util.multimap.MultimapFileLong; +import nu.marginalia.util.multimap.MultimapFileLongSlice; public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) { public BTreeHeader { @@ -28,7 +29,7 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, lon return padding; } - public void write(MultimapFileLong dest, long offset) { + public void write(MultimapFileLongSlice dest, long offset) { dest.put(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL)); dest.put(offset+1, indexOffsetLongs); dest.put(offset+2, dataOffsetLongs); 
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java index 6f8912a9..d1e056b9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java @@ -1,9 +1,7 @@ package nu.marginalia.util.hash; -import io.prometheus.client.Gauge; import lombok.EqualsAndHashCode; import lombok.Getter; -import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable; import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.PrimeUtil; import org.slf4j.Logger; @@ -17,9 +15,7 @@ import static java.lang.Math.round; */ public class LongPairHashMap { private static final Logger logger = LoggerFactory.getLogger(LongPairHashMap.class); - private static final Gauge probe_count_metrics - = Gauge.build("wmsa_wordfile_hash_map_probe_count", "Probing Count") - .register(); + private static final long MAGIC_WORD = 0xE00E00E00E0E0E0EL; // it's the data police private final long hashTableSize; private final MultimapFileLong data; @@ -27,26 +23,37 @@ public class LongPairHashMap { private int sz = 0; private static final int HEADER_SIZE = 2; - public LongPairHashMap(MultimapFileLong data, long size) { + private LongPairHashMap(MultimapFileLong data, long hashTableSize, long maxProbeLength) { this.data = data; - // Actually use a prime size for Donald Knuth reasons - hashTableSize = PrimeUtil.nextPrime(size, 1); - maxProbeLength = hashTableSize / 2; + this.hashTableSize = hashTableSize; + this.maxProbeLength = maxProbeLength; + } - logger.debug("Table size = " + hashTableSize); + public static LongPairHashMap createNew(MultimapFileLong data, long size) { + var tableSize = PrimeUtil.nextPrime(size, 1); + var ret = new LongPairHashMap(data, tableSize, tableSize/2); - data.put(0, IndexWordsTable.Strategy.HASH.ordinal()); - data.put(1, hashTableSize); - for (int i = 
2; i < hashTableSize; i++) { + data.put(0, MAGIC_WORD); + data.put(1, tableSize); + + for (int i = 2; i < tableSize; i++) { data.put(HEADER_SIZE + 2L*i, 0); } - } - public LongPairHashMap(MultimapFileLong data) { - this.data = data; - hashTableSize = data.get(1); - maxProbeLength = hashTableSize / 10; - logger.debug("Table size = " + hashTableSize); + return ret; + } + + public static LongPairHashMap loadExisting(MultimapFileLong data) { + long key = data.get(0); + + if (key != MAGIC_WORD) { + logger.warn("LongPairHashMap lacks magic word, could this be garbage data?"); + } + + var hashTableSize = data.get(1); + var maxProbeLength = hashTableSize / 10; + + return new LongPairHashMap(data, hashTableSize, maxProbeLength); } public int size() { @@ -91,8 +98,6 @@ public class LongPairHashMap { final var val = getCell(idx); if (!val.isSet()) { - probe_count_metrics.set(j); - return setValue(data, idx); } else if (val.getKey() == data.getKey()) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java index dca8248e..f381a977 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java @@ -21,7 +21,7 @@ import static java.nio.channels.FileChannel.MapMode.READ_WRITE; import static nu.marginalia.util.FileSizeUtil.readableSize; -public class MultimapFileLong implements AutoCloseable { +public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { private final ArrayList buffers = new ArrayList<>(); private final ArrayList mappedByteBuffers = new ArrayList<>(); @@ -196,10 +196,12 @@ public class MultimapFileLong implements AutoCloseable { } } + @Override public long size() { return fileLength; } + @Override public void put(long idx, long val) { if (idx >= mappedSize) grow(idx); @@ -214,6 +216,7 @@ public class MultimapFileLong 
implements AutoCloseable { } } + @Override public long get(long idx) { if (idx >= mappedSize) grow(idx); @@ -229,10 +232,12 @@ public class MultimapFileLong implements AutoCloseable { } + @Override public void read(long[] vals, long idx) { read(vals, vals.length, idx); } + @Override public void read(long[] vals, int n, long idx) { if (idx+n >= mappedSize) { grow(idx+n); @@ -257,10 +262,12 @@ public class MultimapFileLong implements AutoCloseable { } + @Override public void write(long[] vals, long idx) { write(vals, vals.length, idx); } + @Override public void write(long[] vals, int n, long idx) { if (idx+n >= mappedSize) { grow(idx+n); @@ -285,6 +292,7 @@ public class MultimapFileLong implements AutoCloseable { } + @Override public void write(LongBuffer vals, long idx) { int n = vals.limit() - vals.position(); if (idx+n >= mappedSize) { @@ -310,6 +318,7 @@ public class MultimapFileLong implements AutoCloseable { } + @Override public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException { int length = (int)(sourceEnd - sourceStart); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java new file mode 100644 index 00000000..c2630ddc --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java @@ -0,0 +1,70 @@ +package nu.marginalia.util.multimap; + +import java.io.IOException; +import java.nio.LongBuffer; +import java.nio.channels.FileChannel; + +public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice { + private final long off; + private final MultimapFileLongSlice map; + + public MultimapFileLongOffsetSlice(MultimapFileLongSlice map, long off) { + this.off = off; + this.map = map; + } + + @Override + public long size() { + return map.size() - off; + } + + @Override + public void put(long idx, 
long val) { + map.put(off+idx, val); + } + + @Override + public long get(long idx) { + return map.get(off+idx); + } + + @Override + public void read(long[] vals, long idx) { + map.read(vals, idx+off); + } + + @Override + public void read(long[] vals, int n, long idx) { + map.read(vals, n, idx+off); + } + + @Override + public void write(long[] vals, long idx) { + map.write(vals, idx+off); + } + + @Override + public void write(long[] vals, int n, long idx) { + map.write(vals, n, idx+off); + } + + @Override + public void write(LongBuffer vals, long idx) { + map.write(vals, idx+off); + } + + @Override + public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) + throws IOException { + map.transferFromFileChannel(sourceChannel, destOffset + off, sourceStart, sourceEnd); + } + + @Override + public MultimapFileLongSlice atOffset(long off) { + // If we don't override this, the default implementation would build a pyramid of + // MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(...))) + // if this is called iteratively (e.g. 
to walk over a file) + + return new MultimapFileLongOffsetSlice(map, this.off + off); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java new file mode 100644 index 00000000..abf29f51 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java @@ -0,0 +1,29 @@ +package nu.marginalia.util.multimap; + +import java.io.IOException; +import java.nio.LongBuffer; +import java.nio.channels.FileChannel; + +public interface MultimapFileLongSlice { + long size(); + + void put(long idx, long val); + + long get(long idx); + + void read(long[] vals, long idx); + + void read(long[] vals, int n, long idx); + + void write(long[] vals, long idx); + + void write(long[] vals, int n, long idx); + + void write(LongBuffer vals, long idx); + + void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException; + + default MultimapFileLongSlice atOffset(long off) { + return new MultimapFileLongOffsetSlice(this, off); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java index c961ac0e..005888d8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java @@ -4,9 +4,9 @@ import lombok.experimental.Delegate; public class MultimapSearcher { @Delegate - private final MultimapFileLong mmf; + private final MultimapFileLongSlice mmf; - public MultimapSearcher(MultimapFileLong mmf) { + public MultimapSearcher(MultimapFileLongSlice mmf) { this.mmf = mmf; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java index 
6ca4f64f..61dd04c4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java @@ -13,10 +13,10 @@ import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE; public class MultimapSorter { private final Path tmpFileDir; private final int internalSortLimit; - private final MultimapFileLong multimapFileLong; + private final MultimapFileLongSlice multimapFileLong; private final long[] buffer; - public MultimapSorter(MultimapFileLong multimapFileLong, Path tmpFileDir, int internalSortLimit) { + public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit) { this.multimapFileLong = multimapFileLong; this.tmpFileDir = tmpFileDir; this.internalSortLimit = internalSortLimit; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java index 85b6c3fe..e2e25fff 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java @@ -48,7 +48,7 @@ public class SqlLoadProcessedDocument { IN STATE VARCHAR(32)) BEGIN UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; - DELETE FROM PAGE_DATA WHERE ID=URL_ID; + DELETE FROM EC_PAGE_DATA WHERE ID=URL_ID; END """); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java index 233ffd3a..30ea2256 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java @@ -135,7 +135,7 @@ public class EdgeDataStoreDaoImpl 
implements EdgeDataStoreDao { final Set domains = new HashSet<>(count*3); final String q = """ - SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART, COUNT(*) AS CNT + SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT FROM EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID @@ -169,7 +169,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { if (domains.size() < count/2) { final String q2 = """ - SELECT EC_DOMAIN.ID, URL_PART + SELECT EC_DOMAIN.ID, DOMAIN_NAME FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID @@ -199,11 +199,11 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { if (domains.size() < count/2) { final String q3 = """ - SELECT EC_DOMAIN.ID, URL_PART - FROM EC_DOMAIN - INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID + SELECT EC_DOMAIN.ID, DOMAIN_NAME + FROM EC_DOMAIN + INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID - INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID + INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE B.DEST_DOMAIN_ID=? 
AND STATE<2 AND KNOWN_URLS<1000 diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/radix/EdgeIndexBucket.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/radix/EdgeIndexBucket.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java index 2e8fdcd2..05bcfe75 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/radix/EdgeIndexBucket.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.index.radix; +package nu.marginalia.wmsa.edge.index; import nu.marginalia.wmsa.edge.index.EdgeIndexControl; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriter; -import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.index.service.query.Query; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriter; +import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.reader.query.Query; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java index b590af55..ab7c73fe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java @@ -3,7 +3,9 @@ package 
nu.marginalia.wmsa.edge.index; import com.google.inject.Inject; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.index.ConversionUnnecessaryException; +import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException; + +import java.io.IOException; public class EdgeIndexControl { @@ -27,7 +29,10 @@ public class EdgeIndexControl { System.gc(); } catch (ConversionUnnecessaryException unnecessary) { - + // swallow quietly + } + catch (IOException e) { + e.printStackTrace(); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index a04a4c83..de6276a8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -15,9 +15,9 @@ import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.Service; import nu.marginalia.wmsa.edge.index.model.*; -import nu.marginalia.wmsa.edge.index.service.SearchIndexes; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; -import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; +import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.wmsa.edge.model.*; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java index 4d0c18e9..61e64b41 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java @@ -5,12 +5,16 @@ import com.google.inject.Singleton; import com.google.inject.name.Named; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; +import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPreconverter; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.service.index.*; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.reader.SearchIndex; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -89,7 +93,7 @@ public class IndexServicesFactory { } - public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException { + public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException { return new SearchIndexConverter(block, id, tmpFileDir, preconverterOutputFile.get(id), indexWriteWordsFile.get(id, block.id), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java similarity index 80% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java index fd7f529f..2242f476 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.conversion; public class ConversionUnnecessaryException extends Exception { public ConversionUnnecessaryException() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchEngineRanking.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchEngineRanking.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java index abaced82..220a9708 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchEngineRanking.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service; +package nu.marginalia.wmsa.edge.index.conversion; import gnu.trove.list.TIntList; import gnu.trove.map.hash.TIntIntHashMap; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java 
index c9b69386..0827b4e7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.conversion; import com.google.inject.Inject; import com.google.inject.name.Named; @@ -6,9 +6,10 @@ import gnu.trove.set.hash.TIntHashSet; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; +import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.index.wordstable.WordsTableWriter; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter; import nu.marginalia.util.btree.BTreeWriter; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.multimap.MultimapFileLong; @@ -32,18 +33,24 @@ public class SearchIndexConverter { private final long fileLength; private final long urlsFileSize; + private final Path tmpFileDir; + private final FileChannel urlsTmpFileChannel; private final int wordCount; private final MultimapFileLong urlsTmpFileMap; private final Logger logger = LoggerFactory.getLogger(getClass()); private final IndexBlock block; private final int bucketId; - @org.jetbrains.annotations.NotNull + + private final File urlsFile; private final SearchIndexPartitioner partitioner; private final TIntHashSet spamDomains; private final MultimapSorter urlTmpFileSorter; + private final static int internalSortLimit = + Boolean.getBoolean("small-ram") ? 
1024*1024 : 1024*1024*256; + @SneakyThrows public static long wordCount(File inputFile) { try (RandomAccessFile raf = new RandomAccessFile(inputFile, "r")) { @@ -52,7 +59,6 @@ public class SearchIndexConverter { } } - @SneakyThrows @Inject public SearchIndexConverter(IndexBlock block, int bucketId, @Named("tmp-file-dir") Path tmpFileDir, @@ -61,13 +67,15 @@ public class SearchIndexConverter { @Named("edge-index-write-urls-file") File outputFileUrls, SearchIndexPartitioner partitioner, EdgeDomainBlacklist blacklist) - throws ConversionUnnecessaryException + throws ConversionUnnecessaryException, IOException { this.block = block; this.bucketId = bucketId; - urlsFile = outputFileUrls; + this.tmpFileDir = tmpFileDir; + this.urlsFile = outputFileUrls; this.partitioner = partitioner; this.spamDomains = blacklist.getSpamDomains(); + logger.info("Converting {} ({}) {}", block.id, block, inputFile); Files.deleteIfExists(outputFileWords.toPath()); @@ -89,18 +97,16 @@ public class SearchIndexConverter { urlsFileSize = getUrlsSize(buffer, inputChannel); var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); - - var urlsTmpFileRaf = new RandomAccessFile(tmpUrlsFile.toFile(), "rw"); urlsTmpFileChannel = urlsTmpFileRaf.getChannel(); urlsTmpFileMap = new MultimapFileLong(urlsTmpFileRaf, FileChannel.MapMode.READ_WRITE, urlsFileSize, 8*1024*1024, false); - urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, 1024*1024*256); + urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit); logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block); - long[] wordIndexTable = createWordIndexTable(outputFileWords, inputChannel); + WordIndexOffsetsTable wordIndexTable = createWordIndexTable(outputFileWords, inputChannel); logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block); - createUrlTable(tmpFileDir, buffer, raf, wordIndexTable); + createUrlTable(buffer, raf, 
wordIndexTable); Files.delete(tmpUrlsFile); raf.close(); @@ -140,99 +146,69 @@ public class SearchIndexConverter { return reader.size; } - private void createUrlTable(Path tmpFileDir, ByteBuffer buffer, RandomAccessFile raf, long[] wordIndexTable) throws IOException { - logger.debug("Table size = {}", wordIndexTable.length); - int[] wordIndex = new int[wordIndexTable.length]; + private void createUrlTable(ByteBuffer buffer, RandomAccessFile raf, WordIndexOffsetsTable wordOffsetsTable) throws IOException { + logger.info("Table size = {}", wordOffsetsTable.length()); + raf.seek(FILE_HEADER_SIZE); var channel = raf.getChannel(); try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) { - var reader = new IndexReader(buffer, channel) { + int[] wordWriteOffset = new int[wordOffsetsTable.length()]; + + new IndexReader(buffer, channel) { @Override public void eachWord(long urlId, int wordId) throws IOException { - if (wordId >= wordIndex.length) + if (wordId >= wordWriteOffset.length) return; - if (wordId != 0) { - if (!(wordIndexTable[wordId - 1] + wordIndex[wordId] <= wordIndexTable[wordId])) { - logger.error("Crazy state: wordId={}, index={}, lower={}, upper={}", - wordId, - wordIndex[wordId], - wordIndexTable[wordId - 1], - wordIndexTable[wordId]); - throw new IllegalStateException(); - } - } if (wordId > 0) { - rwf.put(wordIndexTable[wordId - 1] + wordIndex[wordId]++, translateUrl(urlId)); + rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, translateUrl(urlId)); } else { - rwf.put(wordIndex[wordId]++, translateUrl(urlId)); + rwf.put(wordWriteOffset[wordId]++, translateUrl(urlId)); } } - }; - - reader.read(); + }.read(); rwf.write(urlsTmpFileChannel); } urlsTmpFileChannel.force(false); + logger.info("URL TMP Table: {} Mb", channel.position()/(1024*1024)); - logger.debug("URL TMP Table: {} Mb", channel.position()/(1024*1024)); + if (wordOffsetsTable.length() > 0) { + logger.info("Sorting urls table"); + + 
wordOffsetsTable.forEach(urlTmpFileSorter::sort); - if (wordIndexTable.length > 0) { - logger.debug("Sorting urls table"); - sortUrls(wordIndexTable); urlsTmpFileMap.force(); } else { logger.warn("urls table empty -- nothing to sort"); } - - long idx = 0; - + logger.info("Writing BTree"); try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) { var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext); - if (wordIndexTable[0] != 0) { - int start = 0; - int end = (int) wordIndexTable[0]; + wordOffsetsTable.fold((accumulatorIdx, start, length) -> { + // Note: The return value is accumulated into accumulatorIdx! - idx += writer.write(idx, (int) wordIndexTable[0], - offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end)); - } + return writer.write(accumulatorIdx, length, + slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length)); + }); - for (int i = 1; i < wordIndexTable.length; i++) { - if (wordIndexTable[i] != wordIndexTable[i - 1]) { - long start = wordIndexTable[i-1]; - long end = wordIndexTable[i]; - - idx += writer.write(idx, (int) (end-start), - offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end)); - } - } } catch (Exception e) { - e.printStackTrace(); + logger.error("Error while writing BTree", e); } } - @SneakyThrows - private void sortUrls(long[] wordIndices) { - urlTmpFileSorter.sort( 0, (int) wordIndices[0]); - - for (int i = 1; i < wordIndices.length; i++) { - urlTmpFileSorter.sort(wordIndices[i-1], (int) (wordIndices[i] - wordIndices[i-1])); - } - } - - private long[] createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws Exception { + private WordIndexOffsetsTable createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws IOException { inputChannel.position(FILE_HEADER_SIZE); logger.debug("Table size = {}", wordCount); WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount); - ByteBuffer 
buffer = ByteBuffer.allocateDirect(8*SearchIndexWriterImpl.MAX_BLOCK_SIZE); + ByteBuffer buffer = ByteBuffer.allocateDirect(8* SearchIndexWriterImpl.MAX_BLOCK_SIZE); logger.debug("Reading words"); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java index a12b249e..fcf6d175 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service; +package nu.marginalia.wmsa.edge.index.conversion; import com.google.inject.Inject; import com.google.inject.Singleton; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java index cf281116..bf5a1d74 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java @@ -1,11 +1,9 @@ -package nu.marginalia.wmsa.edge.index.service.query; +package nu.marginalia.wmsa.edge.index.conversion; import com.google.inject.Inject; import com.google.inject.Singleton; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; -import 
nu.marginalia.wmsa.edge.index.service.SearchEngineRanking; -import nu.marginalia.wmsa.edge.index.service.SearchIndexDao; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexPreconverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexPreconverter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java index 5149b546..9e851025 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexPreconverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java @@ -1,10 +1,9 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.conversion; import com.google.inject.Inject; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java new file mode 100644 index 00000000..464e9388 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java @@ -0,0 +1,10 @@ +package nu.marginalia.wmsa.edge.index.conversion.words; + +public class WordIndexLengthsTable { + final long[] table; + + public WordIndexLengthsTable(int size) { + this.table = new long[size]; + } + public void increment(int idx) { table[idx]++; } +} diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java new file mode 100644 index 00000000..29b88509 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java @@ -0,0 +1,67 @@ +package nu.marginalia.wmsa.edge.index.conversion.words; + +import java.io.IOException; + +public class WordIndexOffsetsTable { + final long[] table; + public final int numberOfUsedWords; + + public WordIndexOffsetsTable(long[] table, int numberOfUsedWords) { + + this.table = table; + this.numberOfUsedWords = numberOfUsedWords; + } + + public int length() { + return table.length; + } + + public void forEach(OffsetTableEntryConsumer o) throws IOException { + if (table[0] > 0) { + o.accept(0, (int) table[0]); + } + + for (int i = 1; i < table.length; i++) { + long start = table[i-1]; + int length = (int) (table[i] - start); + + if (length != 0) { + o.accept(start, length); + } + } + } + + /** + * Fold over each span in the file, left to right + */ + public long fold(OffsetTableEntryFoldConsumer o) throws IOException { + long total = 0; + + if (table[0] > 0) { + total = o.accept(total,0, (int) table[0]); + } + + for (int i = 1; i < table.length; i++) { + long start = table[i-1]; + int length = (int) (table[i] - start); + + if (length != 0) { + total += o.accept(total, start, length); + } + } + + return total; + } + + public long get(int i) { + return table[i]; + } + + public interface OffsetTableEntryConsumer { + void accept(long start, int length) throws IOException; + } + + public interface OffsetTableEntryFoldConsumer { + long accept(long accumulator, long start, int length) throws IOException; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java new file mode 100644 index 00000000..2056948b --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java @@ -0,0 +1,56 @@ +package nu.marginalia.wmsa.edge.index.conversion.words; + +/** Contains a stateful table of word index offsets, initially in lengths mode + * where the table contains how many postings exist for each word; then in offsets + * mode, where the lengths are converted into the necessary offsets for each block + * of document data. + * + * Caveat! This uses the same underlying array to conserve space. + * + */ +public class WordIndexTables { + private WordIndexLengthsTable lengthsTable; + private WordIndexOffsetsTable offsetsTable; + + private boolean converted = false; + + public WordIndexTables(int size) { + lengthsTable = new WordIndexLengthsTable(size); + } + + public WordIndexLengthsTable lengths() { + if (converted) throw new IllegalStateException("Table has been converted"); + + return lengthsTable; + } + + public WordIndexOffsetsTable offsets() { + if (!converted) throw new IllegalStateException("Table has not been converted"); + + return offsetsTable; + } + + public void convert() { + if (converted) throw new IllegalStateException("Table has been converted"); + + // Go from lengths to offsets, i.e. 
+ // BEFORE: 1, 2, 1, 3, 0, 2 + // AFTER: 1, 3, 4, 7, 7, 9 + + long[] table = lengthsTable.table; + int numberOfUsedWords = 0; + + if (table[0] != 0) numberOfUsedWords = 1; + + for (int i = 1; i < table.length; i++) { + if (table[i] != 0) { + numberOfUsedWords++; + } + table[i] += table[i-1]; + } + + lengthsTable = null; + offsetsTable = new WordIndexOffsetsTable(table, numberOfUsedWords); + converted = true; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java new file mode 100644 index 00000000..7f762ff3 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java @@ -0,0 +1,75 @@ +package nu.marginalia.wmsa.edge.index.conversion.words; + +import nu.marginalia.util.btree.BTreeWriter; +import nu.marginalia.util.btree.model.BTreeContext; +import nu.marginalia.util.multimap.MultimapFileLong; +import nu.marginalia.util.multimap.MultimapFileLongSlice; +import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; + +import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.urlsBTreeContext; + +public class WordsTableWriter { + private final WordIndexTables table; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8); + + public WordsTableWriter(int length) { + table = new WordIndexTables(length); + } + + public void acceptWord(int wordId) { + table.lengths().increment(wordId); + } + + public WordIndexOffsetsTable getTable() { + return table.offsets(); + } + + public void write(File file) throws IOException { + table.convert(); + + logger.info("Writing table - {} max", table.offsets().numberOfUsedWords); + + final int tableSize = 
table.offsets().numberOfUsedWords; + + try (var mmf = MultimapFileLong.forOutput(file.toPath(), tableSize/8L)) { + mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal()); + long offset = 1; + + var writer = new BTreeWriter(mmf, wordsBTreeContext); + + writer.write(offset, tableSize, this::writeBTreeBlock); + } + } + + private void writeBTreeBlock(MultimapFileLongSlice mapSlice) { + long urlFileOffset = 0; + int idx = 0; + + var offsetTable = table.offsets().table; + + if (offsetTable[0] != 0) { + int length = (int) offsetTable[0]; + mapSlice.put(idx++, (long)length<<32); + mapSlice.put(idx++, 0); + + urlFileOffset += (urlsBTreeContext.calculateSize(length)); + } + + for (int i = 1; i < offsetTable.length; i++) { + final int length = (int)(offsetTable[i] - offsetTable[i-1]); + + if (length > 0) { + mapSlice.put(idx++, (long)length << 32 | i); + mapSlice.put(idx++, urlFileOffset); + + urlFileOffset += (urlsBTreeContext.calculateSize(length)); + } + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java index ca5d70b3..11fc186a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.journal; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.EdgeDomain; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java index d434042d..cf76ada2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.journal; import io.reactivex.rxjava3.disposables.Disposable; import io.reactivex.rxjava3.schedulers.Schedulers; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/BtreeWordsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java similarity index 58% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/BtreeWordsTable.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java index 0a6a70c0..2bde1aa7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/BtreeWordsTable.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java @@ -1,36 +1,80 @@ -package nu.marginalia.wmsa.edge.index.service.index.wordstable; +package nu.marginalia.wmsa.edge.index.reader; import com.upserve.uppend.blobs.NativeIO; import nu.marginalia.util.btree.BTreeReader; import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.channels.FileChannel; import java.util.function.LongConsumer; -import static 
nu.marginalia.wmsa.edge.index.service.index.wordstable.WordsTableWriter.wordsBTreeContext; +import static nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter.wordsBTreeContext; -public class BtreeWordsTable extends IndexWordsTable{ - private final MultimapFileLong words; - private final BTreeReader reader; - private final BTreeHeader header; - private final int HEADER_OFFSET = 1; +public class IndexWordsTable implements AutoCloseable { + protected final MultimapFileLong words; + protected final BTreeReader reader; + protected final BTreeHeader header; + protected final int HEADER_OFFSET = 1; + final Logger logger = LoggerFactory.getLogger(getClass()); - public BtreeWordsTable(MultimapFileLong words) { + private static final int BUFFER_SIZE = 1024*1024*64; + + public IndexWordsTable(MultimapFileLong words) { this.words = words; - reader = new BTreeReader(words, wordsBTreeContext); header = reader.getHeader(HEADER_OFFSET); madvise(); } - private void madvise() { + public static IndexWordsTable ofFile(RandomAccessFile file) throws IOException { + var wordsFile = openWordsFile(file); + long signature = wordsFile.get(0); + + if (signature == Strategy.BTREE.ordinal()) { + return new IndexWordsTable(wordsFile); + } + + throw new IllegalArgumentException("Unknown signature " + signature); + } + + private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException { + return new MultimapFileLong(wordsFile, + FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false); + } + + public long positionForWord(int wordId) { + + long offset = reader.offsetForEntry(header, wordId); + if (offset < 0) { + return -1L; + } + + return words.get(offset+1); + } + + public int wordLength(int wordId) { + + long offset = reader.offsetForEntry(header, wordId); + if (offset < 0) { + return -1; + } + + return (int)(words.get(offset) >> 32); + } + + protected void madvise() { words.advice(NativeIO.Advice.Random); 
words.advice0(NativeIO.Advice.WillNeed); var h = reader.getHeader(HEADER_OFFSET); int length = (int)(h.dataOffsetLongs() - h.indexOffsetLongs()); + words.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length); words.pokeRange(h.indexOffsetLongs(), length); } @@ -58,31 +102,13 @@ public class BtreeWordsTable extends IndexWordsTable{ } } - @Override - public long positionForWord(int wordId) { - - long offset = reader.offsetForEntry(header, wordId); - if (offset < 0) { - return -1L; - } - - return words.get(offset+1); - } - - @Override - public int wordLength(int wordId) { - - long offset = reader.offsetForEntry(header, wordId); - if (offset < 0) { - return -1; - } - - return (int)(words.get(offset) >> 32); - } - @Override public void close() throws Exception { words.close(); } + public enum Strategy { + BTREE + } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java index 17e62437..042f8f54 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java @@ -1,20 +1,18 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.reader; import com.google.inject.Inject; import com.google.inject.name.Named; import com.upserve.uppend.blobs.NativeIO; import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; import nu.marginalia.util.btree.BTreeReader; import nu.marginalia.util.multimap.MultimapFileLong; -import 
org.eclipse.jetty.util.thread.ThreadPool; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; -import java.util.concurrent.ForkJoinPool; import java.util.stream.LongStream; public class SearchIndex implements AutoCloseable { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java index 7baeb8ae..8e7fea81 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.reader; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.query.IndexQueryBuilder; -import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.index.service.query.Query; +import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryBuilder; +import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.reader.query.Query; import org.apache.commons.lang3.tuple.Pair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -105,10 +105,8 @@ public class SearchIndexReader implements AutoCloseable { .mapToLong(idx -> idx.numUrls(word)) .sum() ); - } - public IndexBlock getBlockForResult(int searchTerm, long urlId) { for (var block : 
indicesBySearchOrder) { var index = indices.get(block); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java index dea842c6..863c0c65 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.index.service; +package nu.marginalia.wmsa.edge.index.reader; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; -import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.EdgeIndexBucket; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java index be217057..6f54dd2d 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.index.service.query; +package nu.marginalia.wmsa.edge.index.reader.query; import com.google.common.collect.Streams; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndex; +import nu.marginalia.wmsa.edge.index.reader.SearchIndex; import java.util.Collection; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java index 2ec30e65..3608f70a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.query; +package nu.marginalia.wmsa.edge.index.reader.query; public class IndexSearchBudget { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/Query.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java similarity index 73% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/Query.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java index 09f7701b..5f343d54 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/Query.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java @@ -1,4 +1,4 @@ -package 
nu.marginalia.wmsa.edge.index.service.query; +package nu.marginalia.wmsa.edge.index.reader.query; import java.util.stream.LongStream; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java deleted file mode 100644 index d1c9f10a..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java +++ /dev/null @@ -1,6 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service; - -public enum SearchOrder { - ASCENDING, - REVERSED -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java deleted file mode 100644 index 5b557db1..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java +++ /dev/null @@ -1,48 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service.index.wordstable; - -import nu.marginalia.util.multimap.MultimapFileLong; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.channels.FileChannel; -import java.util.function.LongConsumer; - -public abstract class IndexWordsTable implements AutoCloseable { - final Logger logger = LoggerFactory.getLogger(getClass()); - - private static final int BUFFER_SIZE = 1024*1024*64; - - public static IndexWordsTable ofFile(RandomAccessFile file) throws IOException { - var wordsFile = openWordsFile(file); - long signature = wordsFile.get(0); - - if (signature == Strategy.BTREE.ordinal()) { - return new BtreeWordsTable(wordsFile); - } - throw new IllegalArgumentException("Unknown signature " + signature); - } - - private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException { - return new MultimapFileLong(wordsFile, - 
FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false); - } - - public abstract long positionForWord(int wordId); - - public abstract int wordLength(int wordId); - public abstract void forEachWordsOffset(LongConsumer offsetConsumer); - - @Override - public void close() throws Exception { - - } - - public record TableWordRange(long start, long end) {} - - public enum Strategy { - FLAT, HASH, BTREE_OLD, BTREE - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java deleted file mode 100644 index 3097dd47..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java +++ /dev/null @@ -1,85 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service.index.wordstable; - -import nu.marginalia.util.btree.BTreeWriter; -import nu.marginalia.util.btree.model.BTreeContext; -import nu.marginalia.util.multimap.MultimapFileLong; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.*; - -import static nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter.urlsBTreeContext; - -public class WordsTableWriter { - private final long[] table; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8); - - public WordsTableWriter(int length) { - table = new long[length]; - } - - public void acceptWord(int wordId) { - if (wordId >= table.length) { - logger.warn("Invalid word-id {}", wordId); - } - else { - table[wordId]++; - } - } - - public long[] getTable() { - return table; - } - public void write(File file) throws Exception { - - int tableSize = 0; - - if (table[0] != 0) tableSize = 1; - - for (int i = 1; i < table.length; i++) { - if (table[i] != 0) { - tableSize++; - } - table[i] += 
table[i-1]; - } - - logger.info("Writing table {} words {} max", tableSize, table.length); - - writeBtreeWordsFile(file, table, tableSize); - - } - - private void writeBtreeWordsFile(File outputFileWords, long[] table, int tableSize) throws Exception { - try (var mmf = MultimapFileLong.forOutput(outputFileWords.toPath(), tableSize/8L)) { - mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal()); - long offset = 1; - - var writer = new BTreeWriter(mmf, wordsBTreeContext); - - writer.write(offset, tableSize, (idx) -> { - long urlFileOffset = 0; - - if (table[0] != 0) { - int length = (int) table[0]; - mmf.put(idx++, (long)length<<32); - mmf.put(idx++, 0); - - urlFileOffset += (urlsBTreeContext.calculateSize(length)); - } - - for (int i = 1; i < table.length; i++) { - if (table[i] != table[i - 1]) { - int length = (int)(table[i] - table[i-1]); - mmf.put(idx++, (long)length << 32 | i); - mmf.put(idx++, urlFileOffset); - - urlFileOffset += (urlsBTreeContext.calculateSize(length)); - } - } - }); - } - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java index 0063efd9..02c7197a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java @@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.model.search; import lombok.*; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.SearchOrder; import java.util.Arrays; import java.util.Collections; @@ -21,14 +20,13 @@ public class EdgeSearchSpecification { public final int limitTotal; public final String humanQuery; - public final SearchOrder searchOrder; public boolean stagger; public boolean experimental; public static EdgeSearchSpecification justIncludes(String... 
words) { return new EdgeSearchSpecification( IntStream.range(0, DYNAMIC_BUCKET_LENGTH+1).boxed().toList(), - Collections.singletonList(new EdgeSearchSubquery(Arrays.asList(words), Collections.emptyList(), IndexBlock.Title)), 10, 10, 10, "", SearchOrder.ASCENDING, false, false); + Collections.singletonList(new EdgeSearchSubquery(Arrays.asList(words), Collections.emptyList(), IndexBlock.Title)), 10, 10, 10, "", false, false); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java index 10675cc5..66004dee 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java @@ -11,7 +11,6 @@ import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.SearchOrder; import nu.marginalia.wmsa.edge.model.*; import nu.marginalia.wmsa.edge.model.search.*; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet; @@ -136,7 +135,7 @@ public class EdgeSearchOperator { sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block)); - EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, 100, limitPerDomain, limitTotal, "", SearchOrder.ASCENDING, EdgeSearchProfile.YOLO.equals(profile), false); + EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, 100, limitPerDomain, limitTotal, "", EdgeSearchProfile.YOLO.equals(profile), false); return performQuery(ctx, new EdgeSearchQuery(specs), true); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java index 05fcaa04..212d09ab 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java @@ -1,7 +1,6 @@ package nu.marginalia.wmsa.edge.search; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.SearchOrder; import java.util.Arrays; import java.util.Collections; @@ -9,27 +8,27 @@ import java.util.List; import java.util.stream.Collectors; public enum EdgeSearchProfile { - DEFAULT("default", SearchOrder.ASCENDING, + DEFAULT("default", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 0, 1), - MODERN("modern", SearchOrder.ASCENDING, + MODERN("modern", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 2), - CORPO("corpo", SearchOrder.ASCENDING, + CORPO("corpo", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 4, 5, 6, 7), - YOLO("yolo", SearchOrder.ASCENDING, + YOLO("yolo", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 0, 2, 1, 3, 4, 6), - CORPO_CLEAN("corpo-clean", SearchOrder.ASCENDING, + CORPO_CLEAN("corpo-clean", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 4, 5), - ACADEMIA("academia", SearchOrder.ASCENDING, + ACADEMIA("academia", 
Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 3), @@ -37,17 +36,15 @@ public enum EdgeSearchProfile { public final String name; - public final SearchOrder order; public final List additionalSearchTerm; public final List buckets; public final List indexBlocks; - EdgeSearchProfile(String name, SearchOrder order, + EdgeSearchProfile(String name, List additionalSearchTerm, List indexBlocks, int... buckets) { this.name = name; - this.order = order; this.additionalSearchTerm = additionalSearchTerm; this.indexBlocks = indexBlocks; this.buckets = Arrays.stream(buckets).boxed().collect(Collectors.toList()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java index 60520aa9..6e341721 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java @@ -32,7 +32,7 @@ import java.util.regex.Pattern; public class SiteSearchCommand implements SearchCommandInterface { private final EdgeDataStoreDao dataStoreDao; private final EdgeSearchOperator searchOperator; - private DomainInformationService domainInformationService; + private final DomainInformationService domainInformationService; private final Logger logger = LoggerFactory.getLogger(getClass()); private final MustacheRenderer siteInfoRenderer; @@ -91,7 +91,7 @@ public class SiteSearchCommand implements SearchCommandInterface { logger.info("Fetching Site Info: {}", word); var results = domainInformationService.domainInfo(word) - .orElseGet(() -> new DomainInformation(null, false, 0, 0, 0, 0, 0, 0, 0, EdgeDomainIndexingState.UNKNOWN, Collections.emptyList())); + .orElseGet(() -> 
new DomainInformation(null, false, 0, 0, 0, 0, 0, 0, EdgeDomainIndexingState.UNKNOWN, Collections.emptyList())); logger.debug("Results = {}", results); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java index c5c19187..d94ae487 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java @@ -18,7 +18,6 @@ public class DomainInformation { int pagesIndexed; int incomingLinks; int outboundLinks; - double nominalQuality; double ranking; EdgeDomainIndexingState state; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java index d3775dd9..1d77a9d0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java @@ -138,7 +138,6 @@ public class QueryFactory { .subqueries(subqueries) .limitByBucket(50) .limitTotal(100) - .searchOrder(profile.order) .humanQuery(query) .buckets(profile.buckets); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java index 496fe57b..2f79a9ea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java @@ -57,10 +57,9 @@ public class DomainInformationService { int outboundLinks = getOutboundLinks(domainId); double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100; EdgeDomainIndexingState state = getDomainState(domainId); - 
double nominalQuality = Math.round(100*100*Math.exp(getDomainQuality(domainId)))/100.; List linkingDomains = getLinkingDomains(domainId); - return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, nominalQuality, rank, state, linkingDomains)); + return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, rank, state, linkingDomains)); } private EdgeId getDomainFromPartial(String site) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java index 1251f626..05c67481 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java @@ -8,8 +8,8 @@ import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import nu.marginalia.wmsa.edge.index.model.RankingSettings; -import nu.marginalia.wmsa.edge.index.service.SearchIndexDao; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexDao; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import org.mariadb.jdbc.Driver; import org.roaringbitmap.longlong.Roaring64Bitmap; import org.slf4j.Logger; diff --git a/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb b/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb index 5696b251..cd8abf67 100644 --- a/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb @@ -10,5 +10,4 @@ Pages Known: {{pagesKnown}} Pages Indexed: {{pagesKnown}} Inbound Links: 
{{inboundLinks}} Outbound Links: {{outboundLinks}} -Nominal Quality: {{nominalQuality}}% Crawl Ranking: {{ranking}}% \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/site-info.hdb b/marginalia_nu/src/main/resources/templates/edge/site-info.hdb index 19b585b8..837f320d 100644 --- a/marginalia_nu/src/main/resources/templates/edge/site-info.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/site-info.hdb @@ -37,7 +37,6 @@

Links

- Nominal Quality: {{nominalQuality}}%
Crawl Ranking: {{ranking}}%
Incoming Links: {{incomingLinks}}
Outbound Links: {{outboundLinks}}
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java index 1915d989..875cda37 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java @@ -90,10 +90,10 @@ class BTreeWriterTest { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (offset) -> { + writer.write(0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + 2L*i, data[i]); - mmf.put(offset + 2L*i + 1, i); + slice.put(2L*i, data[i]); + slice.put( 2L*i + 1, i); } }); mmf.force(); @@ -133,10 +133,10 @@ class BTreeWriterTest { { var writer = new BTreeWriter(mmf, ctx); - writer.write( 0, toPut.size(), (offset) -> { + writer.write( 0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + 2L*i, data[i]); - mmf.put(offset + 2L*i + 1, i); + slice.put(2L*i, data[i]); + slice.put(2L*i + 1, i); } }); mmf.force(); @@ -182,9 +182,9 @@ class BTreeWriterTest { try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (offset) -> { + writer.write(0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + i, data[i]); + slice.put(i, data[i]); } }); mmf.force(); @@ -235,9 +235,9 @@ class BTreeWriterTest { try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (offset) -> { + writer.write(0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + i, data[i]); + slice.put(i, data[i]); } }); mmf.force(); @@ -288,10 +288,10 @@ class BTreeWriterTest { try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, 
toPut.size(), (offset) -> { + writer.write(0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + i*2L, data[i]); - mmf.put(offset + i*2L+1, i); + slice.put(i*2L, data[i]); + slice.put(i*2L+1, i); } }); mmf.force(); diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java index 326c9b15..9331a998 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java @@ -27,7 +27,7 @@ class LongPairHashMapTest { try { RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true); - var lphm = new LongPairHashMap(mmf, 1024); + var lphm = LongPairHashMap.createNew(mmf, 1024); toPut.forEach(i -> { lphm.put(new LongPairHashMap.CellData(i, i)); }); @@ -36,7 +36,7 @@ class LongPairHashMapTest { RandomAccessFile raf2 = new RandomAccessFile(tempFile.toFile(), "rw"); MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000, true); - var lphm2 = new LongPairHashMap(mmf2); + var lphm2 = LongPairHashMap.loadExisting(mmf2); toPut.forEach(i -> { Assertions.assertTrue(lphm2.get(i).isSet()); Assertions.assertEquals(i, (int) lphm2.get(i).getKey()); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java index b6e61aa2..961d8304 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java @@ -1,11 +1,11 @@ package nu.marginalia.wmsa.edge.index.service; import lombok.SneakyThrows; +import 
nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java index 6b029da9..2b2da0fd 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java @@ -3,14 +3,14 @@ package nu.marginalia.wmsa.edge.index.service; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.client.exception.RemoteException; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.edge.index.EdgeIndexService; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.EdgeId; @@ -23,7 +23,6 @@ import 
org.junit.jupiter.api.parallel.ResourceAccessMode; import org.junit.jupiter.api.parallel.ResourceLock; import spark.Spark; -import java.io.File; import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; @@ -31,7 +30,6 @@ import java.util.List; import java.util.stream.Collectors; import static nu.marginalia.util.TestUtil.getConnection; -import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java index 6b219bad..edcfa71f 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java @@ -1,14 +1,14 @@ package nu.marginalia.wmsa.edge.index.service; import lombok.SneakyThrows; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndex; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; -import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.reader.SearchIndex; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; +import 
nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; import nu.marginalia.wmsa.edge.model.EdgeId; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java index 4aa9bceb..65b1ad57 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java @@ -13,6 +13,7 @@ class QueryVariantsTest { QueryVariants variants; QueryParser parser; SentenceExtractor se; + @BeforeEach public void setUp() { LanguageModels lm = TestLanguageModels.getLanguageModels(); @@ -24,7 +25,7 @@ class QueryVariantsTest { parser = new QueryParser(new EnglishDictionary(dict), variants); } - @Test + @Test @SuppressWarnings("unchecked") void getQueryVariants() { System.out.println(se.extractSentence("we are alone")); testCase("DOS", List.of("DOS")); @@ -50,7 +51,5 @@ class QueryVariantsTest { private void testCase(String input, List... expected) { var tokens = variants.getQueryVariants(parser.extractBasicTokens(input)); System.out.println(tokens); -// var result = tokens.stream().map(lst -> lst.terms).collect(Collectors.toSet()); -// assertEquals(Set.of(expected), result, "Case failed: " + input); } } \ No newline at end of file From 81c77e7fcb2c5d31f13841462910003d745783e9 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 15 Jun 2022 16:49:18 +0200 Subject: [PATCH 25/27] Revert "Merge branch 'experimental' into master" This reverts commit c3a432fdd42c4c08e271cf0a3c45f589cfe2bfb9, reversing changes made to 1de63f225d9d425ee89741e5a3fa1b00893c5c5b. 
--- marginalia_nu/build.gradle | 31 +- .../wmsa/edge/EdgeSearchE2ETest.java | 23 +- marginalia_nu/src/e2e/resources/init.sh | 2 +- .../nu/marginalia/util/btree/BTreeWriter.java | 16 +- .../marginalia/util/btree/WriteCallback.java | 4 +- .../util/btree/model/BTreeHeader.java | 3 +- .../marginalia/util/hash/LongPairHashMap.java | 45 +- .../util/multimap/MultimapFileLong.java | 11 +- .../multimap/MultimapFileLongOffsetSlice.java | 70 --- .../util/multimap/MultimapFileLongSlice.java | 29 - .../util/multimap/MultimapSearcher.java | 4 +- .../util/multimap/MultimapSorter.java | 4 +- .../marginalia/util/ranking/AcademiaRank.java | 49 ++ .../util/ranking/BetterReversePageRank.java | 8 +- .../util/ranking/BetterStandardPageRank.java | 8 +- .../util/ranking/BuggyReversePageRank.java | 8 +- .../util/ranking/BuggyStandardPageRank.java | 8 +- .../util/ranking/RankingAlgorithm.java | 303 ++++++--- .../util/ranking/RankingDomainData.java | 33 - .../util/ranking/RankingDomainFetcher.java | 105 ---- .../ranking/old/OldReversePageRankV2.java | 4 +- .../util/ranking/old/StandardPageRank.java | 4 +- .../util/ranking/tool/DedupTool.java | 2 +- .../util/ranking/tool/PerusePageRankV2.java | 4 +- .../ranking/tool/TestAcademiaRankTool.java | 30 + .../ranking/tool/UpdateDomainRanksTool.java | 14 +- .../ranking/tool/UpdateDomainRanksTool2.java | 12 +- .../edge/converting/ReindexTriggerMain.java | 4 +- .../converting/interpreter/Interpreter.java | 2 +- .../instruction/LoadProcessedDomain.java | 4 +- .../wmsa/edge/converting/loader/Loader.java | 6 +- .../converting/loader/SqlLoadDomainLinks.java | 6 +- .../converting/loader/SqlLoadDomains.java | 27 +- .../loader/SqlLoadProcessedDocument.java | 24 +- .../loader/SqlLoadProcessedDomain.java | 27 +- .../edge/converting/loader/SqlLoadUrls.java | 20 +- .../processor/InstructionsCompiler.java | 2 +- .../edge/crawling/CrawlJobExtractorMain.java | 11 +- .../CrawlJobExtractorPageRankMain.java | 15 +- .../wmsa/edge/data/dao/EdgeDataStoreDao.java | 32 +- 
.../edge/data/dao/EdgeDataStoreDaoImpl.java | 588 ++++++++++++++++-- .../dao/task/EdgeDomainBlacklistImpl.java | 2 +- .../wmsa/edge/index/EdgeIndexControl.java | 9 +- .../wmsa/edge/index/EdgeIndexService.java | 6 +- .../wmsa/edge/index/IndexServicesFactory.java | 16 +- .../words/WordIndexLengthsTable.java | 10 - .../words/WordIndexOffsetsTable.java | 67 -- .../conversion/words/WordIndexTables.java | 56 -- .../conversion/words/WordsTableWriter.java | 75 --- .../index/{ => radix}/EdgeIndexBucket.java | 10 +- .../SearchEngineRanking.java | 2 +- .../SearchIndexDao.java | 43 +- .../{reader => service}/SearchIndexes.java | 10 +- .../wmsa/edge/index/service/SearchOrder.java | 6 + .../dictionary/DictionaryReader.java | 2 +- .../dictionary/DictionaryWriter.java | 2 +- .../dictionary/TokenCompressor.java | 2 +- .../ConversionUnnecessaryException.java | 2 +- .../index}/SearchIndex.java | 6 +- .../index}/SearchIndexConverter.java | 108 ++-- .../index}/SearchIndexPreconverter.java | 3 +- .../index}/SearchIndexReader.java | 10 +- .../index}/SearchIndexWriter.java | 2 +- .../index}/SearchIndexWriterImpl.java | 4 +- .../index/wordstable/BtreeWordsTable.java} | 90 +-- .../index/wordstable/IndexWordsTable.java | 48 ++ .../index/wordstable/WordsTableWriter.java | 85 +++ .../query/IndexQueryBuilder.java | 4 +- .../query/IndexSearchBudget.java | 2 +- .../{reader => service}/query/Query.java | 2 +- .../query}/SearchIndexPartitioner.java | 4 +- .../wmsa/edge/model/EdgeDomain.java | 5 +- .../model/crawl/EdgeDomainIndexingState.java | 31 +- .../model/search/EdgeSearchSpecification.java | 4 +- .../edge/model/search/EdgeUrlDetails.java | 19 +- .../wmsa/edge/search/EdgeSearchOperator.java | 3 +- .../wmsa/edge/search/EdgeSearchProfile.java | 17 +- .../command/commands/SiteSearchCommand.java | 4 +- .../edge/search/model/DomainInformation.java | 1 + .../wmsa/edge/search/query/QueryFactory.java | 1 + .../search/results/SearchResultDecorator.java | 2 +- .../siteinfo/DomainInformationService.java | 
226 +------ .../wmsa/edge/tools/IndexMergerMain.java | 9 +- .../main/resources/sql/edge-crawler-cache.sql | 176 ++++-- .../templates/edge/site-info-gmi.hdb | 1 + .../resources/templates/edge/site-info.hdb | 1 + .../java/nu/marginalia/util/TestUtil.java | 2 +- .../util/btree/BTreeWriterTest.java | 26 +- .../util/hash/LongPairHashMapTest.java | 4 +- .../loader/SqlLoadDomainLinksTest.java | 48 -- .../converting/loader/SqlLoadDomainsTest.java | 52 -- .../loader/SqlLoadProcessedDocumentTest.java | 94 --- .../loader/SqlLoadProcessedDomainTest.java | 54 -- .../converting/loader/SqlLoadUrlsTest.java | 50 -- .../index/service/DictionaryWriterTest.java | 8 +- .../index/service/EdgeIndexClientTest.java | 6 +- .../service/SearchIndexConverterTest.java | 89 +++ .../index/service/SearchIndexWriterTest.java | 14 +- .../index/service/TokenCompressorTest.java | 2 +- .../edge/search/query/QueryVariantsTest.java | 5 +- 100 files changed, 1667 insertions(+), 1577 deletions(-) delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java delete mode 100644 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{ => radix}/EdgeIndexBucket.java (93%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{conversion => service}/SearchEngineRanking.java (97%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{conversion => service}/SearchIndexDao.java (64%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => service}/SearchIndexes.java (91%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{ => service}/dictionary/DictionaryReader.java (92%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{ => service}/dictionary/DictionaryWriter.java (99%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{ => service}/dictionary/TokenCompressor.java (97%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{conversion => service/index}/ConversionUnnecessaryException.java (80%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => service/index}/SearchIndex.java (93%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{conversion => service/index}/SearchIndexConverter.java (75%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{conversion => service/index}/SearchIndexPreconverter.java (97%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => service/index}/SearchIndexReader.java (96%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{journal => service/index}/SearchIndexWriter.java (88%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{journal => service/index}/SearchIndexWriterImpl.java (96%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader/IndexWordsTable.java => 
service/index/wordstable/BtreeWordsTable.java} (58%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => service}/query/IndexQueryBuilder.java (97%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => service}/query/IndexSearchBudget.java (87%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => service}/query/Query.java (73%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{conversion => service/query}/SearchIndexPartitioner.java (96%) delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java diff --git a/marginalia_nu/build.gradle b/marginalia_nu/build.gradle index eb553649..b2115fb0 100644 --- a/marginalia_nu/build.gradle +++ b/marginalia_nu/build.gradle @@ -59,12 +59,12 @@ dependencies { implementation "com.sparkjava:spark-core:2.9.3" implementation 'com.opencsv:opencsv:5.6' - implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2' - implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2' - implementation group: 'org.apache.logging.log4j', 
name: 'log4j-slf4j-impl', version: '2.17.2' - implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2' - implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2' - implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2' + implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1' + implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1' + implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1' + implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1' + implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1' + implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1' implementation 'org.slf4j:slf4j-api:1.7.36' @@ -76,6 +76,7 @@ dependencies { implementation 'com.github.ThatJavaNerd:JRAW:1.1.0' implementation group: 'com.h2database', name: 'h2', version: '2.1.210' + testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.3.1' implementation 'org.jsoup:jsoup:1.14.3' implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2' @@ -85,7 +86,7 @@ dependencies { implementation 'com.zaxxer:HikariCP:5.0.1' - implementation 'org.apache.opennlp:opennlp-tools:1.9.4' + implementation 'org.apache.opennlp:opennlp-tools:1.9.3' implementation 'io.prometheus:simpleclient:0.15.0' implementation 'io.prometheus:simpleclient_servlet:0.15.0' implementation 'io.prometheus:simpleclient_httpserver:0.15.0' @@ -122,19 +123,15 @@ dependencies { testImplementation 'org.projectlombok:lombok:1.18.24' testAnnotationProcessor 'org.projectlombok:lombok:1.18.24' - testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.5.1' - - testImplementation platform('org.testcontainers:testcontainers-bom:1.17.2') - testImplementation 
'org.testcontainers:mariadb:1.17.2' - testImplementation "org.testcontainers:junit-jupiter:1.17.2" - e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2' e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine' e2eTestImplementation 'org.projectlombok:lombok:1.18.24' - e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24' - e2eTestImplementation 'org.testcontainers:nginx:1.17.2' - e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2" - e2eTestImplementation "org.testcontainers:selenium:1.17.2" + e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.22' + e2eTestImplementation 'org.testcontainers:mariadb:1.17.1' + e2eTestImplementation 'org.testcontainers:nginx:1.17.1' + e2eTestImplementation 'org.testcontainers:testcontainers:1.17.1' + e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.1" + e2eTestImplementation "org.testcontainers:selenium:1.17.1" e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4' e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4' } diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java index 08408de2..af43e462 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java @@ -28,7 +28,6 @@ import java.util.ArrayList; import java.util.List; import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*; -import static org.junit.jupiter.api.Assertions.assertEquals; @Tag("e2e") @Testcontainers @@ -157,16 +156,6 @@ public class EdgeSearchE2ETest extends E2ETestBase { return wikipediaFiles.toString(); } - private List getTitlesFromSearchResults(String html) { - List ret = new ArrayList<>(); - - for (var title : Jsoup.parse(html).select(".card.search-result > h2")) { - ret.add(title.text()); - } - - return ret; - } - @Test public void 
testFrontPage() throws IOException { var driver = chrome.getWebDriver(); @@ -184,9 +173,8 @@ public class EdgeSearchE2ETest extends E2ETestBase { driver.get("http://proxyNginx/search?query=bird&profile=corpo"); System.out.println(driver.getTitle()); + System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); - assertEquals(List.of("Bird"), getTitlesFromSearchResults(html)); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query")); } @@ -199,23 +187,20 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); + Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-info")); } - @Test public void testSiteSearch() throws IOException { var driver = chrome.getWebDriver(); driver.get("http://proxyNginx/search?query=site:wikipedia.local%20frog"); System.out.println(driver.getTitle()); + System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); - - assertEquals(List.of("Frog", "Binomial nomenclature", "Amphibian", "Mantis"), getTitlesFromSearchResults(html)); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search")); } - @Test public void testBrowse() throws IOException { var driver = chrome.getWebDriver(); @@ -224,6 +209,7 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); + Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse")); } @Test @@ -234,6 +220,7 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); 
System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); + Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("define")); } @Test diff --git a/marginalia_nu/src/e2e/resources/init.sh b/marginalia_nu/src/e2e/resources/init.sh index 50dbd406..5409f787 100644 --- a/marginalia_nu/src/e2e/resources/init.sh +++ b/marginalia_nu/src/e2e/resources/init.sh @@ -69,4 +69,4 @@ memex memex dating dating EOF -WMSA_HOME=${HOME} java -server -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1 \ No newline at end of file +WMSA_HOME=${HOME} java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1 \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java index b43faca7..28ac4914 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java @@ -3,7 +3,6 @@ package nu.marginalia.util.btree; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; -import nu.marginalia.util.multimap.MultimapFileLongSlice; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -13,9 +12,9 @@ import java.io.IOException; public class BTreeWriter { private final Logger logger = LoggerFactory.getLogger(BTreeWriter.class); private final BTreeContext ctx; - private final MultimapFileLongSlice map; + private final MultimapFileLong map; - public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) { + public BTreeWriter(MultimapFileLong map, BTreeContext ctx) { this.map = map; this.ctx = ctx; } @@ -32,18 +31,13 @@ public class BTreeWriter { return size; } - /** Construct a BTree with numEntries entries at offset in the associated map - * - * @return The size of the written data - */ - public long 
write(long offset, int numEntries, WriteCallback writeIndexCallback) + public long write(long offset, int numEntries, WriteCallback writeIndex) throws IOException { - BTreeHeader header = makeHeader(offset, numEntries); + var header = makeHeader(offset, numEntries); header.write(map, offset); - - writeIndexCallback.write(map.atOffset(header.dataOffsetLongs())); + writeIndex.write(header.dataOffsetLongs()); if (header.layers() < 1) { return ctx.calculateSize(numEntries); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java index a6225db1..70bd8132 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java @@ -1,9 +1,7 @@ package nu.marginalia.util.btree; -import nu.marginalia.util.multimap.MultimapFileLongSlice; - import java.io.IOException; public interface WriteCallback { - void write(MultimapFileLongSlice slice) throws IOException; + void write(long offset) throws IOException; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java index 8d68b424..4951f5b8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java @@ -1,7 +1,6 @@ package nu.marginalia.util.btree.model; import nu.marginalia.util.multimap.MultimapFileLong; -import nu.marginalia.util.multimap.MultimapFileLongSlice; public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) { public BTreeHeader { @@ -29,7 +28,7 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, lon return padding; } - public void write(MultimapFileLongSlice dest, long offset) { + public void write(MultimapFileLong dest, long offset) { 
dest.put(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL)); dest.put(offset+1, indexOffsetLongs); dest.put(offset+2, dataOffsetLongs); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java index d1e056b9..6f8912a9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java @@ -1,7 +1,9 @@ package nu.marginalia.util.hash; +import io.prometheus.client.Gauge; import lombok.EqualsAndHashCode; import lombok.Getter; +import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable; import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.PrimeUtil; import org.slf4j.Logger; @@ -15,7 +17,9 @@ import static java.lang.Math.round; */ public class LongPairHashMap { private static final Logger logger = LoggerFactory.getLogger(LongPairHashMap.class); - private static final long MAGIC_WORD = 0xE00E00E00E0E0E0EL; // it's the data police + private static final Gauge probe_count_metrics + = Gauge.build("wmsa_wordfile_hash_map_probe_count", "Probing Count") + .register(); private final long hashTableSize; private final MultimapFileLong data; @@ -23,37 +27,26 @@ public class LongPairHashMap { private int sz = 0; private static final int HEADER_SIZE = 2; - private LongPairHashMap(MultimapFileLong data, long hashTableSize, long maxProbeLength) { + public LongPairHashMap(MultimapFileLong data, long size) { this.data = data; - this.hashTableSize = hashTableSize; - this.maxProbeLength = maxProbeLength; - } + // Actually use a prime size for Donald Knuth reasons + hashTableSize = PrimeUtil.nextPrime(size, 1); + maxProbeLength = hashTableSize / 2; - public static LongPairHashMap createNew(MultimapFileLong data, long size) { - var tableSize = PrimeUtil.nextPrime(size, 1); - var ret = new LongPairHashMap(data, tableSize, 
tableSize/2); + logger.debug("Table size = " + hashTableSize); - data.put(0, MAGIC_WORD); - data.put(1, tableSize); - - for (int i = 2; i < tableSize; i++) { + data.put(0, IndexWordsTable.Strategy.HASH.ordinal()); + data.put(1, hashTableSize); + for (int i = 2; i < hashTableSize; i++) { data.put(HEADER_SIZE + 2L*i, 0); } - - return ret; } + public LongPairHashMap(MultimapFileLong data) { + this.data = data; + hashTableSize = data.get(1); + maxProbeLength = hashTableSize / 10; - public static LongPairHashMap loadExisting(MultimapFileLong data) { - long key = data.get(0); - - if (key != MAGIC_WORD) { - logger.warn("LongPairHashMap lacks magic word, could this be garbage data?"); - } - - var hashTableSize = data.get(1); - var maxProbeLength = hashTableSize / 10; - - return new LongPairHashMap(data, hashTableSize, maxProbeLength); + logger.debug("Table size = " + hashTableSize); } public int size() { @@ -98,6 +91,8 @@ public class LongPairHashMap { final var val = getCell(idx); if (!val.isSet()) { + probe_count_metrics.set(j); + return setValue(data, idx); } else if (val.getKey() == data.getKey()) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java index f381a977..dca8248e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java @@ -21,7 +21,7 @@ import static java.nio.channels.FileChannel.MapMode.READ_WRITE; import static nu.marginalia.util.FileSizeUtil.readableSize; -public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { +public class MultimapFileLong implements AutoCloseable { private final ArrayList buffers = new ArrayList<>(); private final ArrayList mappedByteBuffers = new ArrayList<>(); @@ -196,12 +196,10 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { } } - @Override 
public long size() { return fileLength; } - @Override public void put(long idx, long val) { if (idx >= mappedSize) grow(idx); @@ -216,7 +214,6 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { } } - @Override public long get(long idx) { if (idx >= mappedSize) grow(idx); @@ -232,12 +229,10 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { } - @Override public void read(long[] vals, long idx) { read(vals, vals.length, idx); } - @Override public void read(long[] vals, int n, long idx) { if (idx+n >= mappedSize) { grow(idx+n); @@ -262,12 +257,10 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { } - @Override public void write(long[] vals, long idx) { write(vals, vals.length, idx); } - @Override public void write(long[] vals, int n, long idx) { if (idx+n >= mappedSize) { grow(idx+n); @@ -292,7 +285,6 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { } - @Override public void write(LongBuffer vals, long idx) { int n = vals.limit() - vals.position(); if (idx+n >= mappedSize) { @@ -318,7 +310,6 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { } - @Override public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException { int length = (int)(sourceEnd - sourceStart); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java deleted file mode 100644 index c2630ddc..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java +++ /dev/null @@ -1,70 +0,0 @@ -package nu.marginalia.util.multimap; - -import java.io.IOException; -import java.nio.LongBuffer; -import java.nio.channels.FileChannel; - -public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice { - 
private final long off; - private final MultimapFileLongSlice map; - - public MultimapFileLongOffsetSlice(MultimapFileLongSlice map, long off) { - this.off = off; - this.map = map; - } - - @Override - public long size() { - return map.size() - off; - } - - @Override - public void put(long idx, long val) { - map.put(off+idx, val); - } - - @Override - public long get(long idx) { - return map.get(off+idx); - } - - @Override - public void read(long[] vals, long idx) { - map.read(vals, idx+off); - } - - @Override - public void read(long[] vals, int n, long idx) { - map.read(vals, n, idx+off); - } - - @Override - public void write(long[] vals, long idx) { - map.write(vals, idx+off); - } - - @Override - public void write(long[] vals, int n, long idx) { - map.write(vals, n, idx+off); - } - - @Override - public void write(LongBuffer vals, long idx) { - map.write(vals, idx+off); - } - - @Override - public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) - throws IOException { - map.transferFromFileChannel(sourceChannel, destOffset + off, sourceStart, sourceEnd); - } - - @Override - public MultimapFileLongSlice atOffset(long off) { - // If we don't override this, the default implementation would build a pyramid of - // MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(...))) - // if this is called iteratively (e.g. 
to walk over a file) - - return new MultimapFileLongOffsetSlice(map, this.off + off); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java deleted file mode 100644 index abf29f51..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java +++ /dev/null @@ -1,29 +0,0 @@ -package nu.marginalia.util.multimap; - -import java.io.IOException; -import java.nio.LongBuffer; -import java.nio.channels.FileChannel; - -public interface MultimapFileLongSlice { - long size(); - - void put(long idx, long val); - - long get(long idx); - - void read(long[] vals, long idx); - - void read(long[] vals, int n, long idx); - - void write(long[] vals, long idx); - - void write(long[] vals, int n, long idx); - - void write(LongBuffer vals, long idx); - - void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException; - - default MultimapFileLongSlice atOffset(long off) { - return new MultimapFileLongOffsetSlice(this, off); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java index 005888d8..c961ac0e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java @@ -4,9 +4,9 @@ import lombok.experimental.Delegate; public class MultimapSearcher { @Delegate - private final MultimapFileLongSlice mmf; + private final MultimapFileLong mmf; - public MultimapSearcher(MultimapFileLongSlice mmf) { + public MultimapSearcher(MultimapFileLong mmf) { this.mmf = mmf; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java index 
61dd04c4..6ca4f64f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java @@ -13,10 +13,10 @@ import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE; public class MultimapSorter { private final Path tmpFileDir; private final int internalSortLimit; - private final MultimapFileLongSlice multimapFileLong; + private final MultimapFileLong multimapFileLong; private final long[] buffer; - public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit) { + public MultimapSorter(MultimapFileLong multimapFileLong, Path tmpFileDir, int internalSortLimit) { this.multimapFileLong = multimapFileLong; this.tmpFileDir = tmpFileDir; this.internalSortLimit = internalSortLimit; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java new file mode 100644 index 00000000..272a1798 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java @@ -0,0 +1,49 @@ +package nu.marginalia.util.ranking; + +import com.zaxxer.hikari.HikariDataSource; +import gnu.trove.list.TIntList; +import gnu.trove.list.array.TIntArrayList; +import gnu.trove.map.hash.TIntIntHashMap; +import it.unimi.dsi.fastutil.ints.IntArrays; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.sql.SQLException; + +public class AcademiaRank { + private final TIntArrayList result; + private static final Logger logger = LoggerFactory.getLogger(AcademiaRank.class); + + public AcademiaRank(HikariDataSource ds, String... 
origins) throws IOException { + + TIntList rankingResults = new BetterStandardPageRank(ds, origins).pageRank(100_000); + TIntIntHashMap idToRanking = new TIntIntHashMap(100_000, 0.5f, -1, 1_000_000_000); + + for (int i = 0; i < rankingResults.size(); i++) { + idToRanking.put(rankingResults.get(i), i); + } + + result = new TIntArrayList(10000); + try (var conn = ds.getConnection(); + var stmt = conn.prepareStatement("select EC_DOMAIN.ID,COUNT(SOURCE_DOMAIN_ID) AS CNT from EC_DOMAIN INNER JOIN DOMAIN_METADATA ON DOMAIN_METADATA.ID=EC_DOMAIN.ID INNER JOIN EC_DOMAIN_LINK ON EC_DOMAIN_LINK.DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE INDEXED>0 AND STATE>=0 AND STATE<2 AND ((VISITED_URLS>1000+1500*RANK AND RANK<1) OR (GOOD_URLS>1000 AND URL_PART LIKE '%edu')) GROUP BY EC_DOMAIN.ID HAVING CNT<1500 ORDER BY RANK ASC")) { + + stmt.setFetchSize(1000); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + result.add(rsp.getInt(1)); + } + } + catch (SQLException ex) { + logger.error("SQL error", ex); + } + + int[] internalArray = result.toArray(); + IntArrays.quickSort(internalArray, (a,b) -> idToRanking.get(a) - idToRanking.get(b)); + result.set(0, internalArray); + } + + public TIntArrayList getResult() { + return result; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java index 7d3b17c4..f2889ad6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java @@ -1,11 +1,15 @@ package nu.marginalia.util.ranking; +import com.zaxxer.hikari.HikariDataSource; + +import java.io.IOException; + public class BetterReversePageRank extends RankingAlgorithm { - public BetterReversePageRank(RankingDomainFetcher domains, String... origins) { - super(domains, origins); + public BetterReversePageRank(HikariDataSource dataSource, String... 
origins) { + super(dataSource, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java index f1f9b0b1..5b64fa73 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java @@ -1,10 +1,14 @@ package nu.marginalia.util.ranking; +import com.zaxxer.hikari.HikariDataSource; + +import java.io.IOException; + public class BetterStandardPageRank extends RankingAlgorithm { - public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) { - super(domains, origins); + public BetterStandardPageRank(HikariDataSource dataSource, String... origins) { + super(dataSource, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java index 485ba353..1e87776c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java @@ -1,11 +1,15 @@ package nu.marginalia.util.ranking; +import com.zaxxer.hikari.HikariDataSource; + +import java.io.IOException; + public class BuggyReversePageRank extends RankingAlgorithm { - public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) { - super(domains, origins); + public BuggyReversePageRank(HikariDataSource dataSource, String... 
origins) { + super(dataSource, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java index 836bcdfe..a3d7b87e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java @@ -1,10 +1,14 @@ package nu.marginalia.util.ranking; +import com.zaxxer.hikari.HikariDataSource; + +import java.io.IOException; + public class BuggyStandardPageRank extends RankingAlgorithm { - public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) { - super(domains, origins); + public BuggyStandardPageRank(HikariDataSource dataSource, String... origins) { + super(dataSource, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java index 4d255087..fd76989c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java @@ -1,129 +1,224 @@ package nu.marginalia.util.ranking; +import com.zaxxer.hikari.HikariDataSource; import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntObjectHashMap; +import gnu.trove.set.hash.TIntHashSet; import it.unimi.dsi.fastutil.ints.IntComparator; +import lombok.AllArgsConstructor; +import lombok.Data; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.sql.SQLException; import java.util.*; import 
java.util.function.IntToDoubleFunction; import java.util.stream.IntStream; import it.unimi.dsi.fastutil.ints.IntArrays; public abstract class RankingAlgorithm { - protected final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); - protected final TIntIntHashMap domainIndexToId = new TIntIntHashMap(); - protected final TIntIntHashMap domainIdToIndex = new TIntIntHashMap(); + final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); + final TIntIntHashMap domainIndexToId = new TIntIntHashMap(); + final TIntIntHashMap domainIdToIndex = new TIntIntHashMap(); - protected TIntArrayList[] linkDataSrc2Dest; - protected TIntArrayList[] linkDataDest2Src; + private final TIntHashSet spamDomains; + private final HikariDataSource dataSource; + + TIntArrayList[] linkDataSrc2Dest; + TIntArrayList[] linkDataDest2Src; public final Set originDomains = new HashSet<>(); public final Set originDomainIds = new HashSet<>(); private int maxKnownUrls = Integer.MAX_VALUE; + private static final boolean getNames = true; + private final Logger logger = LoggerFactory.getLogger(getClass()); - private final RankingDomainFetcher domains; + public static void main(String... args) throws IOException { + var rpr = new BuggyReversePageRank(new DatabaseModule().provideConnection(), "wiki.xxiivv.com"); + var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu"); - public RankingAlgorithm(RankingDomainFetcher domains, String... 
origins) { - this.domains = domains; - - originDomains.addAll(Arrays.asList(origins)); - - domains.getDomains(domainData -> { - int id = domainData.id; - - domainsById.put(id, domainData); - - domainIndexToId.put(domainIndexToId.size(), id); - domainIdToIndex.put(id, domainIdToIndex.size()); + var rankVector = spr.pageRankVector(); + var norm = rankVector.norm(); + rpr.pageRank(i -> rankVector.get(i) / norm, 25).forEach(i -> { + System.out.println(spr.domainNameFromId(i)); + return true; }); - - linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()]; - linkDataDest2Src = new TIntArrayList[domainIndexToId.size()]; - - domains.eachDomainLink((src, dst) -> { - if (src == dst) return; - - if (domainsById.contains(src) && domainsById.contains(dst)) { - - int srcIdx = domainIdToIndex.get(src); - int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); - - if (linkDataSrc2Dest[srcIdx] == null) { - linkDataSrc2Dest[srcIdx] = new TIntArrayList(); - } - linkDataSrc2Dest[srcIdx].add(dstIdx); - - if (linkDataDest2Src[dstIdx] == null) { - linkDataDest2Src[dstIdx] = new TIntArrayList(); - } - linkDataDest2Src[dstIdx].add(srcIdx); - } - }); - - for (var namePattern : this.originDomains) { - domains.domainsByPattern(namePattern, i -> { - int ival = domainIdToIndex.get(i); - if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) { - originDomainIds.add(ival); - } - else { - logger.debug("No value for {}", i); - } - }); - } - logger.info("Origin Domains: {}", originDomainIds.size()); } - public void addPeripheralNodes() { + public String domainNameFromId(int id) { + return domainsById.get(id).name; + } + public boolean isPeripheral(int id) { + return domainsById.get(id).peripheral; + } + + public RankingAlgorithm(HikariDataSource dataSource, String... 
origins) { + this.dataSource = dataSource; + var blacklist = new EdgeDomainBlacklistImpl(dataSource); + + spamDomains = blacklist.getSpamDomains(); + originDomains.addAll(Arrays.asList(origins)); + + try (var conn = dataSource.getConnection()) { + + String s; + if (getNames) { + s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + } + else { + s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + } + try (var stmt = conn.prepareStatement(s)) { + stmt.setFetchSize(10000); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + int id = rsp.getInt(1); + if (!spamDomains.contains(id)) { + + domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), false)); + + domainIndexToId.put(domainIndexToId.size(), id); + domainIdToIndex.put(id, domainIdToIndex.size()); + } + } + } + + + linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()]; + linkDataDest2Src = new TIntArrayList[domainIndexToId.size()]; + + try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) { + stmt.setFetchSize(10000); + + var rsp = stmt.executeQuery(); + + while (rsp.next()) { + int src = rsp.getInt(1); + int dst = rsp.getInt(2); + + if (src == dst) continue; + + if (domainsById.contains(src) && domainsById.contains(dst)) { + + int srcIdx = domainIdToIndex.get(src); + int 
dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); + + if (linkDataSrc2Dest[srcIdx] == null) { + linkDataSrc2Dest[srcIdx] = new TIntArrayList(); + } + linkDataSrc2Dest[srcIdx].add(dstIdx); + + if (linkDataDest2Src[dstIdx] == null) { + linkDataDest2Src[dstIdx] = new TIntArrayList(); + } + linkDataDest2Src[dstIdx].add(srcIdx); + } + } + } + + try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART LIKE ?")) { + for (var seed : this.originDomains) { + stmt.setString(1, seed); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + int i = rsp.getInt(1); + int ival = domainIdToIndex.get(i); + if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) { + originDomainIds.add(ival); + } + else { + logger.debug("No value for {}", i); + } + } + logger.debug("{} -> {}", seed, originDomainIds.size()); + } + } + + logger.info("Origin Domains: {}", originDomainIds.size()); + + } catch (SQLException throwables) { + logger.error("SQL error", throwables); + } + } + + public void addPeripheralNodes(boolean includeErrorStates) { int newNodesIdxCutoff = domainIdToIndex.size(); logger.info("Inserting peripheral nodes"); - domains.getPeripheralDomains(domainData -> { - int id = domainData.id; - - if (domainsById.put(id, domainData) == null) { // true if id was not already present - domainIndexToId.put(domainIndexToId.size(), id); - domainIdToIndex.put(id, domainIdToIndex.size()); + try (var conn = dataSource.getConnection()) { + String s; + if (getNames) { + s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; } - }); - - linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size()); - linkDataDest2Src = 
Arrays.copyOf(linkDataDest2Src, domainIndexToId.size()); - - domains.eachDomainLink((src, dst) -> { - if (src == dst) return; - - if (domainsById.contains(src) && domainsById.contains(dst)) { - int srcIdx = domainIdToIndex.get(src); - int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); - - // This looks like a bug, but it improves the results - if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff) - return; - - if (linkDataSrc2Dest[srcIdx] == null) { - linkDataSrc2Dest[srcIdx] = new TIntArrayList(); - } - linkDataSrc2Dest[srcIdx].add(dstIdx); - - if (linkDataDest2Src[dstIdx] == null) { - linkDataDest2Src[dstIdx] = new TIntArrayList(); - } - linkDataDest2Src[dstIdx].add(srcIdx); + else { + s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; } - }); + try (var stmt = conn.prepareStatement(s)) { + stmt.setFetchSize(10000); + var rsp = stmt.executeQuery(); + + while (rsp.next()) { + int id = rsp.getInt(1); + + if (!spamDomains.contains(id)) { + domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), true)); + + domainIndexToId.put(domainIndexToId.size(), id); + domainIdToIndex.put(id, domainIdToIndex.size()); + } + } + + } + + linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size()); + linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size()); + + try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) { + stmt.setFetchSize(10000); + + var rsp = stmt.executeQuery(); + + while (rsp.next()) { + int src = rsp.getInt(1); + int dst = rsp.getInt(2); + + if (src == dst) continue; + + if (domainsById.contains(src) && 
domainsById.contains(dst)) { + + int srcIdx = domainIdToIndex.get(src); + int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); + + // This looks like a bug, but it improves the results + if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff) + continue; + + if (linkDataSrc2Dest[srcIdx] == null) { + linkDataSrc2Dest[srcIdx] = new TIntArrayList(); + } + linkDataSrc2Dest[srcIdx].add(dstIdx); + + if (linkDataDest2Src[dstIdx] == null) { + linkDataDest2Src[dstIdx] = new TIntArrayList(); + } + linkDataDest2Src[dstIdx].add(srcIdx); + } + } + } + } catch (SQLException throwables) { + logger.error("SQL error", throwables); + } logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size()); } @@ -176,14 +271,14 @@ public abstract class RankingAlgorithm { return rank.getRanking(resultCount); } - public TIntList pageRankWithPeripheralNodes(int resultCount) { + public TIntList pageRankWithPeripheralNodes(int resultCount, boolean includeErrorStates) { RankVector rank = new RankVector(1.d / domainsById.size()); int iter_max = 100; for (int i = 0; i < iter_max; i++) { if (i == iter_max-1) { - addPeripheralNodes(); + addPeripheralNodes(includeErrorStates); } RankVector newRank = createNewRankVector(rank); @@ -228,7 +323,7 @@ public abstract class RankingAlgorithm { abstract RankVector createNewRankVector(RankVector rank); - public boolean includeInRanking(RankingDomainData data) { + public boolean includeInRanking(DomainData data) { if (data.isAlias()) return false; if (data.isSpecial()) @@ -350,4 +445,32 @@ public abstract class RankingAlgorithm { } } + @Data + @AllArgsConstructor + static class DomainData { + public final int id; + public final String name; + private int alias; + private int state; + public final int knownUrls; + public boolean peripheral; + + public int resolveAlias() { + if (alias == 0) return id; + return alias; + } + + public boolean isAlias() { + return alias != 0; + } + + public boolean isSpecial() { + 
return EdgeDomainIndexingState.SPECIAL.code == state; + } + + public boolean isSocialMedia() { + return EdgeDomainIndexingState.SOCIAL_MEDIA.code == state; + } + } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java deleted file mode 100644 index c29ed704..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java +++ /dev/null @@ -1,33 +0,0 @@ -package nu.marginalia.util.ranking; - -import lombok.AllArgsConstructor; -import lombok.Data; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; - -@Data -@AllArgsConstructor -class RankingDomainData { - public final int id; - public final String name; - private int alias; - private EdgeDomainIndexingState state; - public final int knownUrls; - public boolean peripheral; - - public int resolveAlias() { - if (alias == 0) return id; - return alias; - } - - public boolean isAlias() { - return alias != 0; - } - - public boolean isSpecial() { - return EdgeDomainIndexingState.SPECIAL == state; - } - - public boolean isSocialMedia() { - return EdgeDomainIndexingState.SOCIAL_MEDIA == state; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java deleted file mode 100644 index 79285a83..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java +++ /dev/null @@ -1,105 +0,0 @@ -package nu.marginalia.util.ranking; - -import com.google.inject.Inject; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.util.function.Consumer; -import java.util.function.IntConsumer; - 
-public class RankingDomainFetcher { - private final HikariDataSource dataSource; - private final EdgeDomainBlacklistImpl blacklist; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final boolean getNames = false; - - @Inject - public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) { - this.dataSource = dataSource; - this.blacklist = blacklist; - } - - public void getDomains(Consumer consumer) { - String query; - if (getNames) { - query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; - } - else { - query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; - } - - getDomains(query, consumer); - } - - - public void getPeripheralDomains(Consumer consumer) { - String query; - if (getNames) { - query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; - } - else { - query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND 
VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; - } - - getDomains(query, consumer); - } - - private void getDomains(String query, Consumer consumer) { - try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) { - stmt.setFetchSize(10000); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int id = rsp.getInt(1); - if (!blacklist.isBlacklisted(id)) { - consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false)); - } - } - } - catch (SQLException ex) { - logger.error("Failed to fetch domains", ex); - } - } - - public void eachDomainLink(DomainLinkConsumer consumer) { - try (var conn = dataSource.getConnection(); - var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) - { - stmt.setFetchSize(10000); - - var rsp = stmt.executeQuery(); - - while (rsp.next()) { - int src = rsp.getInt(1); - int dst = rsp.getInt(2); - - consumer.accept(src, dst); - } - } - catch (SQLException ex) { - logger.error("Failed to fetch domain links", ex); - } - } - - public void domainsByPattern(String pattern, IntConsumer idConsumer) { - try (var conn = dataSource.getConnection(); - var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) { - stmt.setString(1, pattern); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - idConsumer.accept(rsp.getInt(1)); - } - } - catch (SQLException ex) { - logger.error("Failed to fetch domains by pattern", ex); - } - } - - public interface DomainLinkConsumer { - void accept(int from, int to); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java index 02823563..6a214278 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java @@ -66,7 +66,7 @@ public class OldReversePageRankV2 { originDomains.add("memex.marginalia.nu"); try (var conn = dataSource.getConnection()) { - try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE")) { + try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY_RAW>=-10")) { stmt.setFetchSize(10000); var rsp = stmt.executeQuery(); while (rsp.next()) { @@ -90,7 +90,7 @@ public class OldReversePageRankV2 { } } - try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { + try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) { stmt.setFetchSize(10000); for (var seed : this.originDomains) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java index 74bef70a..c42b28dd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java @@ -48,7 +48,7 @@ public class StandardPageRank { originDomains.addAll(Arrays.asList(origins)); try (var conn = dataSource.getConnection()) { - try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,DOMAIN_NAME FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE AND QUALITY>=-10")) { + try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,URL_PART FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY>=-10")) { stmt.setFetchSize(10000); var rsp = stmt.executeQuery(); while (rsp.next()) { @@ -78,7 +78,7 @@ public class StandardPageRank { } } - try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { + try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) { for (var seed : 
this.originDomains) { stmt.setString(1, seed); var rsp = stmt.executeQuery(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java index d6f95f51..a5ea8b06 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java @@ -50,7 +50,7 @@ public class DedupTool { Map>> domainToHashToUrl = new HashMap<>(); try (var conn = ds.getConnection(); - var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL"); + var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.URL_PART FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL"); var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?") ) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java index 3f3ce6a5..85a691c2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java @@ -112,10 +112,10 @@ public class PerusePageRankV2 { try (var conn = dataSource.getConnection()) { String s; if (getNames) { - s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID"; + s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND 
VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID"; } else { - s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID"; + s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID"; } try (var stmt = conn.prepareStatement(s)) { stmt.setFetchSize(10000); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java new file mode 100644 index 00000000..38192b35 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java @@ -0,0 +1,30 @@ +package nu.marginalia.util.ranking.tool; + +import lombok.SneakyThrows; +import nu.marginalia.util.ranking.AcademiaRank; +import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import org.mariadb.jdbc.Driver; + +import java.io.IOException; + +public class TestAcademiaRankTool { + + @SneakyThrows + public static void main(String... 
args) { + Driver driver = new Driver(); + var conn = new DatabaseModule().provideConnection(); + + var rank = new AcademiaRank(new DatabaseModule().provideConnection(), "www.perseus.tufts.edu", "xroads.virginia.edu"); + var res = rank.getResult(); + + try (var c = conn.getConnection(); var stmt = c.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) { + for (int i = 0; i < Math.min(res.size(), 100); i++) { + stmt.setInt(1, res.getQuick(i)); + var rsp = stmt.executeQuery(); + while (rsp.next()) + System.out.println(rsp.getString(1)); + } + } + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java index f80d307f..71ec72a6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java @@ -3,13 +3,12 @@ package nu.marginalia.util.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.util.ranking.BuggyStandardPageRank; -import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.sql.SQLException; import java.util.HashSet; import java.util.Set; @@ -44,14 +43,12 @@ public class UpdateDomainRanksTool { var uploader = new Thread(() -> uploadThread(conn), "Uploader"); logger.info("Ranking"); - var ds = new DatabaseModule().provideConnection(); - var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); - var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu"); + var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(),"memex.marginalia.nu"); 
rankMax = spr.size()*2; uploader.start(); - spr.pageRankWithPeripheralNodes(rankMax).forEach(i -> { + spr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> { try { uploadQueue.put(i); } catch (InterruptedException e) { @@ -86,6 +83,11 @@ public class UpdateDomainRanksTool { } } + logger.info("Recalculating quality"); + try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) { + stmt.executeUpdate(); + } + } catch (SQLException | InterruptedException throwables) { throwables.printStackTrace(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java index f46fb390..336b35fd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java @@ -3,13 +3,12 @@ package nu.marginalia.util.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.util.ranking.BetterReversePageRank; -import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.sql.SQLException; import java.util.HashSet; import java.util.Set; @@ -46,9 +45,7 @@ public class UpdateDomainRanksTool2 { logger.info("Ranking"); // "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com", // "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net" - var ds = new DatabaseModule().provideConnection(); - var domains = new RankingDomainFetcher(ds, new 
EdgeDomainBlacklistImpl(ds)); - var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); + var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); // var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu"); // var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu"); @@ -61,7 +58,7 @@ public class UpdateDomainRanksTool2 { rankMax = rpr.size(); - rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> { + rpr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> { try { uploadQueue.put(i); } catch (InterruptedException e) { @@ -97,6 +94,9 @@ public class UpdateDomainRanksTool2 { } logger.info("Recalculating quality"); + try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) { + stmt.executeUpdate(); + } } catch (SQLException | InterruptedException throwables) { throwables.printStackTrace(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java index 55648dfd..050152bc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java @@ -29,7 +29,7 @@ public class ReindexTriggerMain { .build(); try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) { - var rs = stmt.executeQuery("SELECT ID, DOMAIN_NAME, STATE, INDEXED FROM EC_DOMAIN LIMIT 100"); + var rs = stmt.executeQuery("SELECT ID, URL_PART, STATE, INDEXED FROM EC_DOMAIN LIMIT 100"); while (rs.next()) { System.out.printf("%d %s %s %d\n", 
rs.getInt(1), @@ -38,7 +38,7 @@ public class ReindexTriggerMain { rs.getInt(4)); } - rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, PATH, VISITED, STATE FROM EC_URL LIMIT 100"); + rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, URL, VISITED, STATE FROM EC_URL LIMIT 100"); while (rs.next()) { System.out.printf("%d %d %s %d %s\n", rs.getInt(1), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java index c0698dde..8755716c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java @@ -14,7 +14,7 @@ public interface Interpreter { void loadRssFeed(EdgeUrl[] rssFeed); void loadDomainLink(DomainLink[] links); - void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip); + void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality); void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument); void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java index 2b1fd631..065d6211 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java @@ -6,11 +6,11 @@ import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; -public record 
LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) implements Instruction { +public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) implements Instruction { @Override public void apply(Interpreter interpreter) { - interpreter.loadProcessedDomain(domain, state, ip); + interpreter.loadProcessedDomain(domain, state, quality); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java index 49a39457..140a762a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java @@ -76,9 +76,9 @@ public class Loader implements Interpreter { } @Override - public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) { - logger.debug("loadProcessedDomain({}, {}, {})", domain, state, ip); - sqlLoadProcessedDomain.load(data, domain, state, ip); + public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) { + logger.debug("loadProcessedDomain({}, {}, {})", domain, state, quality); + sqlLoadProcessedDomain.load(data, domain, state, quality); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java index 6750bd33..e0978828 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java @@ -30,7 +30,7 @@ public class SqlLoadDomainLinks { INSERT IGNORE INTO EC_DOMAIN_LINK (SOURCE_DOMAIN_ID, DEST_DOMAIN_ID) SELECT SOURCE.ID,DEST.ID FROM EC_DOMAIN SOURCE INNER JOIN EC_DOMAIN DEST - ON 
SOURCE.DOMAIN_NAME=FROM_DOMAIN AND DEST.DOMAIN_NAME=TO_DOMAIN; + ON SOURCE.URL_PART=FROM_DOMAIN AND DEST.URL_PART=TO_DOMAIN; END """); } @@ -61,8 +61,8 @@ public class SqlLoadDomainLinks { } } } - catch (SQLException ex) { - logger.warn("SQL error inserting domain links", ex); + catch (SQLException sql) { + sql.printStackTrace(); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java index 76a839c9..18cc40bd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java @@ -25,9 +25,15 @@ public class SqlLoadDomains { stmt.execute(""" CREATE PROCEDURE INSERT_DOMAIN ( IN DOMAIN_NAME VARCHAR(255), + IN SUB_DOMAIN VARCHAR(255), IN TOP_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci) BEGIN - INSERT IGNORE INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP) VALUES (DOMAIN_NAME, TOP_DOMAIN); + INSERT IGNORE INTO EC_TOP_DOMAIN (URL_PART) VALUES (TOP_DOMAIN); + + INSERT IGNORE INTO EC_DOMAIN(URL_PART, URL_SUBDOMAIN, URL_TOP_DOMAIN_ID) + SELECT DOMAIN_NAME,SUB_DOMAIN,ID + FROM EC_TOP_DOMAIN + WHERE EC_TOP_DOMAIN.URL_PART=TOP_DOMAIN; END """); } @@ -40,9 +46,10 @@ public class SqlLoadDomains { public void load(LoaderData data, EdgeDomain domain) { try (var connection = dataSource.getConnection()) { - try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) { + try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) { insertCall.setString(1, domain.toString()); - insertCall.setString(2, domain.domain); + insertCall.setString(2, domain.subDomain); + insertCall.setString(3, domain.domain); insertCall.addBatch(); var ret = insertCall.executeUpdate(); @@ -50,11 +57,12 @@ public class SqlLoadDomains { logger.warn("load({}) -- bad row count {}", domain, ret); } + 
connection.commit(); findIdForTargetDomain(connection, data); } } catch (SQLException ex) { - logger.warn("SQL error inserting domain", ex); + ex.printStackTrace(); } @@ -65,11 +73,12 @@ public class SqlLoadDomains { try (var connection = dataSource.getConnection()) { connection.setAutoCommit(false); - try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) { + try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) { for (var domain : domains) { insertCall.setString(1, domain.toString()); - insertCall.setString(2, domain.domain); + insertCall.setString(2, domain.subDomain); + insertCall.setString(3, domain.domain); insertCall.addBatch(); } var ret = insertCall.executeBatch(); @@ -86,7 +95,7 @@ public class SqlLoadDomains { findIdForTargetDomain(connection, data); } catch (SQLException ex) { - logger.warn("SQL error inserting domains", ex); + ex.printStackTrace(); } } @@ -95,7 +104,7 @@ public class SqlLoadDomains { return; } - try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) + try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) { var targetDomain = data.getTargetDomain(); @@ -109,7 +118,7 @@ public class SqlLoadDomains { } } catch (SQLException ex) { - logger.warn("SQL error finding id for domain", ex); + ex.printStackTrace(); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java index e2e25fff..b25a657b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java @@ -31,14 +31,14 @@ public class SqlLoadProcessedDocument { IN TITLE VARCHAR(255), IN DESCRIPTION VARCHAR(255), IN LENGTH INT, + IN QUALITY_MEASURE DOUBLE, IN 
FEATURES INT, IN STANDARD VARCHAR(32), - IN QUALITY DOUBLE, IN HASH INT) BEGIN SET FOREIGN_KEY_CHECKS=0; - REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY); - UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; + REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES); + UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=QUALITY_MEASURE, DATA_HASH=HASH WHERE ID=URL_ID; SET FOREIGN_KEY_CHECKS=1; END """); @@ -47,8 +47,7 @@ public class SqlLoadProcessedDocument { IN URL_ID INT, IN STATE VARCHAR(32)) BEGIN - UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; - DELETE FROM EC_PAGE_DATA WHERE ID=URL_ID; + UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=-100, DATA_HASH=NULL WHERE ID=URL_ID; END """); @@ -62,7 +61,6 @@ public class SqlLoadProcessedDocument { public void load(LoaderData data, List documents) { try (var conn = dataSource.getConnection(); var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) { - conn.setAutoCommit(false); for (var doc : documents) { int urlId = data.getUrlId(doc.url()); @@ -76,9 +74,9 @@ public class SqlLoadProcessedDocument { stmt.setString(3, doc.title()); stmt.setString(4, doc.description()); stmt.setInt(5, doc.length()); - stmt.setInt(6, doc.htmlFeatures()); - stmt.setString(7, doc.standard().name()); - stmt.setDouble(8, doc.quality()); + stmt.setDouble(6, doc.quality()); + stmt.setInt(7, doc.htmlFeatures()); + stmt.setString(8, doc.standard().name()); stmt.setInt(9, (int) doc.hash()); stmt.addBatch(); } @@ -91,8 +89,8 @@ public class SqlLoadProcessedDocument { } conn.commit(); - } catch (SQLException ex) { - logger.warn("SQL error inserting document", ex); + } catch (SQLException e) { + e.printStackTrace(); } @@ -119,8 +117,8 @@ public class 
SqlLoadProcessedDocument { logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]); } } - } catch (SQLException ex) { - logger.warn("SQL error inserting failed document", ex); + } catch (SQLException e) { + e.printStackTrace(); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java index 018d76c9..64607b3a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java @@ -25,12 +25,12 @@ public class SqlLoadProcessedDomain { stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN"); stmt.execute(""" CREATE PROCEDURE INITIALIZE_DOMAIN ( - IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'), + IN ST INT, IN IDX INT, - IN DID INT, - IN IP VARCHAR(32)) + IN QUAL DOUBLE, + IN DID INT) BEGIN - UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID; + UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), QUALITY=QUAL, QUALITY_RAW=QUAL, QUALITY_ORIGINAL=QUAL WHERE ID=DID; DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; END """); @@ -41,7 +41,7 @@ public class SqlLoadProcessedDomain { } } - public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, String ip) { + public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, double quality) { data.setTargetDomain(domain); loadDomains.load(data, domain); @@ -49,17 +49,18 @@ public class SqlLoadProcessedDomain { try (var conn = dataSource.getConnection(); var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)")) { - initCall.setString(1, state.name()); + initCall.setInt(1, state.code); 
initCall.setInt(2, 1 + data.sizeHint / 100); - initCall.setInt(3, data.getDomainId(domain)); - initCall.setString(4, ip); + initCall.setDouble(3, quality); + initCall.setInt(4, data.getDomainId(domain)); int rc = initCall.executeUpdate(); if (rc < 1) { - logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc); + logger.warn("load({},{},{}) -- bad rowcount {}", domain, state, quality, rc); } + conn.commit(); } catch (SQLException ex) { - logger.warn("SQL error initializing domain", ex); + ex.printStackTrace(); } } @@ -68,9 +69,9 @@ public class SqlLoadProcessedDomain { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" UPDATE EC_DOMAIN TARGET - INNER JOIN EC_DOMAIN ALIAS ON ALIAS.DOMAIN_NAME=? + INNER JOIN EC_DOMAIN ALIAS ON ALIAS.URL_PART=? SET TARGET.DOMAIN_ALIAS=ALIAS.ID - WHERE TARGET.DOMAIN_NAME=? + WHERE TARGET.URL_PART=? """)) { stmt.setString(1, link.to().toString()); stmt.setString(2, link.from().toString()); @@ -80,7 +81,7 @@ public class SqlLoadProcessedDomain { } } catch (SQLException ex) { - logger.warn("SQL error inserting domain alias", ex); + ex.printStackTrace(); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java index ba9ae43a..7d8851ca 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java @@ -1,13 +1,11 @@ package nu.marginalia.wmsa.edge.converting.loader; -import com.google.common.hash.Hashing; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.nio.charset.StandardCharsets; import java.sql.SQLException; import java.sql.Types; @@ -27,13 +25,12 @@ public class SqlLoadUrls { stmt.execute(""" 
CREATE PROCEDURE INSERT_URL ( IN PROTO VARCHAR(255), - IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, + IN DOMAIN_NAME VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, IN PORT INT, - IN PATH VARCHAR(255), - IN PATH_HASH BIGINT + IN URL VARCHAR(255) ) BEGIN - INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN; + INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,URL) SELECT PROTO,ID,PORT,URL FROM EC_DOMAIN WHERE URL_PART=DOMAIN_NAME; END """); } @@ -45,8 +42,8 @@ public class SqlLoadUrls { public void load(LoaderData data, EdgeUrl[] urls) { try (var conn = dataSource.getConnection(); - var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?, ?)"); - var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH FROM EC_URL WHERE DOMAIN_ID=?") + var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?)"); + var queryCall = conn.prepareStatement("SELECT ID, PROTO, URL FROM EC_URL WHERE DOMAIN_ID=?") ) { conn.setAutoCommit(false); @@ -61,7 +58,6 @@ public class SqlLoadUrls { insertCall.setNull(3, Types.INTEGER); } insertCall.setString(4, url.path); - insertCall.setLong(5, hashPath(url.path)); insertCall.addBatch(); } var ret = insertCall.executeBatch(); @@ -90,11 +86,7 @@ public class SqlLoadUrls { } catch (SQLException ex) { - logger.warn("SQL error inserting URLs", ex); + ex.printStackTrace(); } } - - private long hashPath(String path) { - return Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8).asLong(); - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java index b75de436..d36cb830 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java @@ -15,7 +15,7 @@ public class InstructionsCompiler { public List compile(ProcessedDomain domain) { List ret = new ArrayList<>(domain.size()*4); - ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); + ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.averageQuality().orElse(-5.))); if (domain.documents != null) { compileUrls(ret, domain.documents); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java index 52fe338a..2f25d6d7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java @@ -34,10 +34,11 @@ public class CrawlJobExtractorMain { private static final String domainsSql = """ - SELECT ID, LOWER(EC_DOMAIN.DOMAIN_NAME) + SELECT ID, LOWER(EC_DOMAIN.URL_PART) FROM EC_DOMAIN - WHERE INDEXED>0 - AND STATE='ACTIVE' OR STATE='EXHAUSTED' + WHERE QUALITY_RAW>-100 + AND INDEXED>0 + AND STATE<2 ORDER BY INDEX_DATE ASC, DISCOVER_DATE ASC, @@ -48,8 +49,8 @@ public class CrawlJobExtractorMain { private static final String urlsSql = """ - SELECT URL - FROM EC_URL_VIEW + SELECT CONCAT(PROTO, "://", ?, URL) + FROM EC_URL WHERE DOMAIN_ID=? 
ORDER BY VISITED DESC, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java index ea1946fc..21935fd0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java @@ -6,7 +6,6 @@ import com.google.common.hash.Hashing; import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; @@ -31,19 +30,19 @@ public class CrawlJobExtractorPageRankMain { """ SELECT ID FROM EC_DOMAIN - WHERE DOMAIN_NAME=? + WHERE URL_PART=? """; private static final String specificDomainSqlFromId = """ - SELECT LOWER(DOMAIN_NAME) + SELECT LOWER(URL_PART) FROM EC_DOMAIN WHERE ID=? """; private static final String urlsSql = """ - SELECT URL - FROM EC_URL_VIEW + SELECT CONCAT(PROTO, "://", ?, URL) + FROM EC_URL WHERE DOMAIN_ID=? 
ORDER BY VISITED DESC, @@ -74,12 +73,10 @@ public class CrawlJobExtractorPageRankMain { Gson gson = new GsonBuilder().create(); - var ds = new DatabaseModule().provideConnection(); - var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); - var rpr = new BetterReversePageRank(domains, "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); + var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); rpr.setMaxKnownUrls(750); - var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size()); + var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size(), false); try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) { final var extractor = new CrawlJobExtractorPageRankMain(new DatabaseModule().provideConnection()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java index 2f309b07..81e8dd58 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java @@ -13,14 +13,44 @@ import java.util.Optional; @ImplementedBy(EdgeDataStoreDaoImpl.class) public interface EdgeDataStoreDao { + boolean isBlacklisted(EdgeDomain domain); + EdgeId getDomainId(EdgeDomain domain); + EdgeId getUrlId(EdgeUrl domain); + EdgeUrl getUrl(EdgeId id); + EdgeUrlDetails getUrlDetails(EdgeId id); + List getDomainNeighbors(EdgeId domainId, EdgeDomainBlacklist backlist, int count); List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist backlist, int count); - List getRandomDomains(int count, EdgeDomainBlacklist backlist); List getUrlDetailsMulti(List> ids); + List> getDomainIdsFromUrlIds(Collection> urlIds); + EdgeDomain getDomain(EdgeId 
id); + List> inboudUrls(EdgeId id, int limit); + List> outboundUrls(EdgeId id, int limit); + Optional> resolveAmbiguousDomain(String name); + + + int getPagesKnown(EdgeId domainId); + int getPagesVisited(EdgeId domainId); + int getPagesIndexed(EdgeId domainId); + + int getIncomingLinks(EdgeId domainId); + int getOutboundLinks(EdgeId domainId); + + double getDomainQuality(EdgeId domainId); + + EdgeDomainIndexingState getDomainState(EdgeId domainId); + + List getLinkingDomains(EdgeId domainId); + + List getNewUrls(EdgeId domainId, Collection links); + + double getRank(EdgeId domainId); + + void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java index 30ea2256..a214bb15 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java @@ -33,6 +33,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { private final Cache> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build(); private final Cache> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build(); + private static final String DEFAULT_PROTOCOL = "http"; public static double QUALITY_LOWER_BOUND_CUTOFF = -15.; @Inject public EdgeDataStoreDaoImpl(HikariDataSource dataSource) @@ -47,13 +48,30 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { domainIdCache.invalidateAll(); } + @SneakyThrows + @Override + public boolean isBlacklisted(EdgeDomain domain) { + + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) { + stmt.setString(1, domain.domain); + var rsp = stmt.executeQuery(); + if 
(rsp.next()) { + return true; + } else { + return false; + } + } + } + } + @SneakyThrows @Override public EdgeId getDomainId(EdgeDomain domain) { try (var connection = dataSource.getConnection()) { return domainIdCache.get(domain, () -> { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) { stmt.setString(1, domain.toString()); var rsp = stmt.executeQuery(); if (rsp.next()) { @@ -68,14 +86,104 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { } } - private String idList(List> ids) { - StringJoiner j = new StringJoiner(",", "(", ")"); - for (var id : ids) { - j.add(Integer.toString(id.getId())); + @Override + @SneakyThrows + public EdgeId getUrlId(EdgeUrl url) { + try (var connection = dataSource.getConnection()) { + + return urlIdCache.get(url, () -> { + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=? AND URL_PROTO=?")) { + stmt.setString(1, url.path); + stmt.setString(2, url.domain.toString()); + stmt.setString(3, url.proto); + + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return new EdgeId<>(rsp.getInt(1)); + } + } + // Lenient mode for http->https upgrades etc + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? 
AND URL_DOMAIN=?")) { + stmt.setString(1, url.path); + stmt.setString(2, url.domain.toString()); + + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return new EdgeId<>(rsp.getInt(1)); + } + } + throw new NoSuchElementException(url.toString()); + }); + } + catch (UncheckedExecutionException ex) { + throw ex.getCause(); } - return j.toString(); } + + @SneakyThrows + @Override + public List> getDomainIdsFromUrlIds(Collection> urlIds) { + List> results = new ArrayList<>(urlIds.size()); + + if (urlIds.isEmpty()) + return results; + + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT DOMAIN_ID FROM EC_URL WHERE ID IN " + urlIds + .stream() + .map(EdgeId::getId) + .map(Object::toString) + .collect(Collectors.joining(",", "(", ")")))) + { + var rsp = stmt.executeQuery(); + while (rsp.next()) { + results.add(new EdgeId<>(rsp.getInt(1))); + } + + } + } + + return results; + } + + static final Pattern badChars = Pattern.compile("[';\\\\]"); + private String saneString(String s) { + return "\'"+badChars.matcher(s).replaceAll("?")+"\'"; + } + @SneakyThrows + @Override + public EdgeUrl getUrl(EdgeId id) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.createStatement()) { + var rsp = stmt.executeQuery("SELECT URL_PROTO, URL_DOMAIN,URL_PORT,URL_PATH FROM EC_URL_VIEW WHERE ID=" + id.getId()); + if (rsp.next()) { + return new EdgeUrl(rsp.getString(1), new EdgeDomain(rsp.getString(2)), rsp.getInt(3), rsp.getString(4)); + } + throw new NoSuchElementException(); + } + } + } + + @SneakyThrows + @Override + public EdgeUrlDetails getUrlDetails(EdgeId id) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.createStatement()) { + var rsp = stmt.executeQuery("SELECT 
ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID=" + id.getId()); + if (rsp.next()) { + EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5)); + return new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17)); + } + throw new NoSuchElementException(); + } + } + } + + @SneakyThrows @Override public List getUrlDetailsMulti(List> ids) { @@ -85,39 +193,16 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { List result = new ArrayList<>(ids.size()); try (var connection = dataSource.getConnection()) { + // This is SQL-injection safe, the IDs are of type int + String idString = ids.stream().map(EdgeId::getId).map(Objects::toString).collect(Collectors.joining(",", "(", ")")); - String idString = idList(ids); - - try (var stmt = connection.prepareStatement( - """ - SELECT ID, URL, - TITLE, DESCRIPTION, - QUALITY, - WORDS_TOTAL, FORMAT, FEATURES, - IP, DOMAIN_STATE, - DATA_HASH - FROM EC_URL_VIEW WHERE ID IN - """ + idString)) { + try (var stmt = connection.prepareStatement("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) { stmt.setFetchSize(ids.size()); var rsp = 
stmt.executeQuery(); while (rsp.next()) { - EdgeUrl url = new EdgeUrl(rsp.getString(2)); - var val = new EdgeUrlDetails(rsp.getInt(1), url, - rsp.getString(3), // title - rsp.getString(4), // description - rsp.getDouble(5), // quality - rsp.getInt(6), // wordsTotal - rsp.getString(7), // format - rsp.getInt(8), // features - rsp.getString(9), // ip - EdgeDomainIndexingState.valueOf(rsp.getString(10)), // domainState - rsp.getInt(11), // dataHash - EdgePageScoreAdjustment.zero(), // urlQualityAdjustment - Integer.MAX_VALUE, // rankingId - Double.MAX_VALUE, // termScore - 0 // queryLength - ); + EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5)); + var val = new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17)); if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) { result.add(val); } @@ -129,13 +214,82 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { return result; } + @Override + public List getDomainNeighbors(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) { + final Set domains = new HashSet<>(count*3); + + final String q = "SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART from EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL AND EC_DOMAIN_NEIGHBORS.DOMAIN_ID = ? 
ORDER BY ADJ_IDX LIMIT ?"; + + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement(q)) { + stmt.setFetchSize(count); + stmt.setInt(1, domainId.getId()); + stmt.setInt(2, count); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + int id = rsp.getInt(1); + String domain = rsp.getString(2); + + if (!blacklist.isBlacklisted(id)) { + var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); + + domains.add(new BrowseResult(url, id)); + } + } + } + + final String q2 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE SOURCE_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?"; + try (var stmt = connection.prepareStatement(q2)) { + + stmt.setFetchSize(count); + stmt.setInt(1, domainId.getId()); + stmt.setInt(2, count); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + int id = rsp.getInt(1); + String domain = rsp.getString(2); + + if (!blacklist.isBlacklisted(id)) { + var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); + + domains.add(new BrowseResult(url, id)); + } + } + } + + final String q3 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE DEST_DOMAIN_ID=? 
AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?"; + try (var stmt = connection.prepareStatement(q3)) { + stmt.setFetchSize(count); + stmt.setInt(1, domainId.getId()); + stmt.setInt(2, count); + + var rsp = stmt.executeQuery(); + while (rsp.next()) { + int id = rsp.getInt(1); + String domain = rsp.getString(2); + + if (!blacklist.isBlacklisted(id)) { + var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); + + domains.add(new BrowseResult(url, id)); + } + } + } + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + + + return new ArrayList<>(domains); + } + @Override public List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) { final Set domains = new HashSet<>(count*3); final String q = """ - SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT + SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART, COUNT(*) AS CNT FROM EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID @@ -162,14 +316,16 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); + var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); + + domains.add(new BrowseResult(url, id)); } } } if (domains.size() < count/2) { final String q2 = """ - SELECT EC_DOMAIN.ID, DOMAIN_NAME + SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID @@ -191,7 +347,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); + var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, 
"/"); + + domains.add(new BrowseResult(url, id)); } } } @@ -199,11 +357,11 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { if (domains.size() < count/2) { final String q3 = """ - SELECT EC_DOMAIN.ID, DOMAIN_NAME - FROM EC_DOMAIN - INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID + SELECT EC_DOMAIN.ID, URL_PART + FROM EC_DOMAIN + INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID - INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID + INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE B.DEST_DOMAIN_ID=? AND STATE<2 AND KNOWN_URLS<1000 @@ -223,7 +381,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); + var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); + + domains.add(new BrowseResult(url, id)); } } } @@ -239,15 +399,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { @Override public List getRandomDomains(int count, EdgeDomainBlacklist blacklist) { - final String q = """ - SELECT DOMAIN_ID, DOMAIN_NAME - FROM EC_RANDOM_DOMAINS - INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID - WHERE STATE<2 - AND DOMAIN_ALIAS IS NULL - ORDER BY RAND() - LIMIT ? 
- """; + final String q = "SELECT DOMAIN_ID,URL_PART FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?"; List domains = new ArrayList<>(count); try (var conn = dataSource.getConnection()) { try (var stmt = conn.prepareStatement(q)) { @@ -258,7 +410,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); + var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); + + domains.add(new BrowseResult(url, id)); } } } @@ -274,7 +428,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { public EdgeDomain getDomain(EdgeId id) { try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) { + try (var stmt = connection.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) { stmt.setInt(1, id.getId()); var rsp = stmt.executeQuery(); if (rsp.next()) { @@ -285,4 +439,330 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { } } + @Override @SneakyThrows + public List> inboudUrls(EdgeId id, int limit) { + + List> ret = new ArrayList<>(); + try (var connection = dataSource.getConnection()) { + + try (var stmt = + connection.prepareStatement("SELECT SRC_URL_ID FROM EC_RELATED_LINKS_IN WHERE DEST_URL_ID=? 
ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) { + stmt.setFetchSize(limit); + stmt.setInt(1, id.getId()); + stmt.setInt(2, limit); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + ret.add(new EdgeId<>(rsp.getInt(1))); + } + } + + } + + return ret; + } + + + @Override @SneakyThrows + public List> outboundUrls(EdgeId id, int limit) { + + List> ret = new ArrayList<>(); + try (var connection = dataSource.getConnection()) { + + try (var stmt = + connection.prepareStatement("SELECT DEST_URL_ID FROM EC_RELATED_LINKS_IN WHERE SRC_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) { + stmt.setFetchSize(limit); + stmt.setInt(1, id.getId()); + stmt.setInt(2, limit); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + ret.add(new EdgeId<>(rsp.getInt(1))); + } + } + + } + + return ret; + } + + @Override + public Optional> resolveAmbiguousDomain(String name) { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { + stmt.setString(1, name); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return Optional.of(new EdgeId<>(rsp.getInt(1))); + } + } + + try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { + stmt.setString(1, "https://"+name); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return Optional.of(new EdgeId<>(rsp.getInt(1))); + } + } + + try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { + stmt.setString(1, "http://"+name); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return Optional.of(new EdgeId<>(rsp.getInt(1))); + } + } + + try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { + stmt.setString(1, "https://www."+name); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return Optional.of(new EdgeId<>(rsp.getInt(1))); + } + } + + try (var stmt = 
connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) { + stmt.setString(1, "http://www."+name); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return Optional.of(new EdgeId<>(rsp.getInt(1))); + } + } + + } catch (SQLException throwables) { + logger.info("Could not resolve domain id for {}", name); + } + + return Optional.empty(); + } + + @SneakyThrows + @Override + public int getPagesKnown(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + @SneakyThrows + @Override + public int getPagesVisited(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + + @SneakyThrows + @Override + public int getPagesIndexed(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + @SneakyThrows + @Override + public int getIncomingLinks(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = 
stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + @SneakyThrows + @Override + public int getOutboundLinks(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + @SneakyThrows + @Override + public double getDomainQuality(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getDouble(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return -5; + } + } + + @Override + public EdgeDomainIndexingState getDomainState(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return EdgeDomainIndexingState.fromCode(rsp.getInt(1)); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + return EdgeDomainIndexingState.ERROR; + } + + @Override + public List getLinkingDomains(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + List results = new ArrayList<>(25); + try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? 
ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + results.add(new EdgeDomain(rsp.getString(1))); + } + return results; + } catch (Exception ex) { + logger.error("DB error", ex); + } + + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + return Collections.emptyList(); + } + + @Override + public List getNewUrls(EdgeId domainId, Collection links) { + Map edgeUrlByPath = links.stream().collect(Collectors.toMap(EdgeUrl::getPath, Function.identity(), (a,b)->a)); + + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement("SELECT URL FROM EC_URL WHERE DOMAIN_ID=?")) { + stmt.setFetchSize(500); + stmt.setInt(1, domainId.getId()); + var rs = stmt.executeQuery(); + while (rs.next()) { + edgeUrlByPath.remove(rs.getString(1)); + } + } + } + catch (Exception ex) { + return Collections.emptyList(); + } + return new ArrayList<>(edgeUrlByPath.values()); + + } + + @Override + public double getRank(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getDouble(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + return 1; + } + + @Override + public void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed) { + try (var connection = dataSource.getConnection(); + var stmt = connection.prepareStatement("UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=?, DOMAIN_ALIAS=?, INDEXED=GREATEST(INDEXED,?) 
WHERE ID=?")) { + stmt.setInt(1, state.code); + if (null == alias) { + stmt.setNull(2, Types.INTEGER); + } + else { + stmt.setInt(2, getDomainId(alias).getId()); + } + + stmt.setInt(3, minIndexed); + stmt.setInt(4, getDomainId(domain).getId()); + stmt.executeUpdate(); + connection.commit(); + } + catch (SQLException throwables) { + logger.error("SQL error", throwables); + } + } + + @SneakyThrows + private double getDomainQuality(Connection connection, EdgeDomain src) { + try (var stmt = connection.prepareStatement("SELECT QUALITY_RAW FROM EC_DOMAIN WHERE URL_PART=?")) { + stmt.setString(1, src.toString()); + var res = stmt.executeQuery(); + + if (res.next()) { + var q = res.getDouble(1); + if (q > 0.5) { + logger.warn("gDQ({}) -> 1", src); + } + return 0; + } + } + catch (SQLException ex) { + logger.error("DB error", ex); + } + + return -5; + } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java index 334ec5a9..f4cbb8d0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java @@ -50,7 +50,7 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist { final TIntHashSet result = new TIntHashSet(1_000_000); try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP")) { + try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) { stmt.setFetchSize(1000); var rsp = stmt.executeQuery(); while (rsp.next()) { 
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java index ab7c73fe..b590af55 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java @@ -3,9 +3,7 @@ package nu.marginalia.wmsa.edge.index; import com.google.inject.Inject; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException; - -import java.io.IOException; +import nu.marginalia.wmsa.edge.index.service.index.ConversionUnnecessaryException; public class EdgeIndexControl { @@ -29,10 +27,7 @@ public class EdgeIndexControl { System.gc(); } catch (ConversionUnnecessaryException unnecessary) { - // swallow quietly - } - catch (IOException e) { - e.printStackTrace(); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index de6276a8..a04a4c83 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -15,9 +15,9 @@ import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.Service; import nu.marginalia.wmsa.edge.index.model.*; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; -import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.service.SearchIndexes; +import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; +import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; import 
nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.wmsa.edge.model.*; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java index 61e64b41..fb58ac0e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java @@ -5,16 +5,12 @@ import com.google.inject.Singleton; import com.google.inject.name.Named; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPreconverter; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.reader.SearchIndex; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket; +import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader; +import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; +import nu.marginalia.wmsa.edge.index.service.index.*; +import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -93,7 +89,7 @@ public class IndexServicesFactory { } - public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException { + public 
SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException { return new SearchIndexConverter(block, id, tmpFileDir, preconverterOutputFile.get(id), indexWriteWordsFile.get(id, block.id), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java deleted file mode 100644 index 464e9388..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.wmsa.edge.index.conversion.words; - -public class WordIndexLengthsTable { - final long[] table; - - public WordIndexLengthsTable(int size) { - this.table = new long[size]; - } - public void increment(int idx) { table[idx]++; } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java deleted file mode 100644 index 29b88509..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java +++ /dev/null @@ -1,67 +0,0 @@ -package nu.marginalia.wmsa.edge.index.conversion.words; - -import java.io.IOException; - -public class WordIndexOffsetsTable { - final long[] table; - public final int numberOfUsedWords; - - public WordIndexOffsetsTable(long[] table, int numberOfUsedWords) { - - this.table = table; - this.numberOfUsedWords = numberOfUsedWords; - } - - public int length() { - return table.length; - } - - public void forEach(OffsetTableEntryConsumer o) throws IOException { - if (table[0] > 0) { - o.accept(0, (int) table[0]); - } - - for (int i = 1; i < table.length; i++) { - long start = table[i-1]; - int length = (int) (table[i] - start); - - if (length != 0) { - o.accept(start, length); - } - } - } - - /** 
- * Fold over each span in the file, left to right - */ - public long fold(OffsetTableEntryFoldConsumer o) throws IOException { - long total = 0; - - if (table[0] > 0) { - total = o.accept(total,0, (int) table[0]); - } - - for (int i = 1; i < table.length; i++) { - long start = table[i-1]; - int length = (int) (table[i] - start); - - if (length != 0) { - total += o.accept(total, start, length); - } - } - - return total; - } - - public long get(int i) { - return table[i]; - } - - public interface OffsetTableEntryConsumer { - void accept(long start, int length) throws IOException; - } - - public interface OffsetTableEntryFoldConsumer { - long accept(long accumulator, long start, int length) throws IOException; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java deleted file mode 100644 index 2056948b..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java +++ /dev/null @@ -1,56 +0,0 @@ -package nu.marginalia.wmsa.edge.index.conversion.words; - -/** Contains a stateful table of word index offsets, initially in lengths mode - * where the table contains how many postings exist for each word; then in offsets - * mode, where the lengths are converted into the necessary offsets for each block - * of document data. - * - * Caveat! This uses the same underlying array to conserve space. 
- * - */ -public class WordIndexTables { - private WordIndexLengthsTable lengthsTable; - private WordIndexOffsetsTable offsetsTable; - - private boolean converted = false; - - public WordIndexTables(int size) { - lengthsTable = new WordIndexLengthsTable(size); - } - - public WordIndexLengthsTable lengths() { - if (converted) throw new IllegalStateException("Table has been converted"); - - return lengthsTable; - } - - public WordIndexOffsetsTable offsets() { - if (!converted) throw new IllegalStateException("Table has not been converted"); - - return offsetsTable; - } - - public void convert() { - if (converted) throw new IllegalStateException("Table has been converted"); - - // Go from lengths to offsets, i.e. - // BEFORE: 1, 2, 1, 3, 0, 2 - // AFTER: 1, 3, 4, 7, 7, 9 - - long[] table = lengthsTable.table; - int numberOfUsedWords = 0; - - if (table[0] != 0) numberOfUsedWords = 1; - - for (int i = 1; i < table.length; i++) { - if (table[i] != 0) { - numberOfUsedWords++; - } - table[i] += table[i-1]; - } - - lengthsTable = null; - offsetsTable = new WordIndexOffsetsTable(table, numberOfUsedWords); - converted = true; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java deleted file mode 100644 index 7f762ff3..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java +++ /dev/null @@ -1,75 +0,0 @@ -package nu.marginalia.wmsa.edge.index.conversion.words; - -import nu.marginalia.util.btree.BTreeWriter; -import nu.marginalia.util.btree.model.BTreeContext; -import nu.marginalia.util.multimap.MultimapFileLong; -import nu.marginalia.util.multimap.MultimapFileLongSlice; -import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.*; - -import static 
nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.urlsBTreeContext; - -public class WordsTableWriter { - private final WordIndexTables table; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8); - - public WordsTableWriter(int length) { - table = new WordIndexTables(length); - } - - public void acceptWord(int wordId) { - table.lengths().increment(wordId); - } - - public WordIndexOffsetsTable getTable() { - return table.offsets(); - } - - public void write(File file) throws IOException { - table.convert(); - - logger.info("Writing table - {} max", table.offsets().numberOfUsedWords); - - final int tableSize = table.offsets().numberOfUsedWords; - - try (var mmf = MultimapFileLong.forOutput(file.toPath(), tableSize/8L)) { - mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal()); - long offset = 1; - - var writer = new BTreeWriter(mmf, wordsBTreeContext); - - writer.write(offset, tableSize, this::writeBTreeBlock); - } - } - - private void writeBTreeBlock(MultimapFileLongSlice mapSlice) { - long urlFileOffset = 0; - int idx = 0; - - var offsetTable = table.offsets().table; - - if (offsetTable[0] != 0) { - int length = (int) offsetTable[0]; - mapSlice.put(idx++, (long)length<<32); - mapSlice.put(idx++, 0); - - urlFileOffset += (urlsBTreeContext.calculateSize(length)); - } - - for (int i = 1; i < offsetTable.length; i++) { - final int length = (int)(offsetTable[i] - offsetTable[i-1]); - - if (length > 0) { - mapSlice.put(idx++, (long)length << 32 | i); - mapSlice.put(idx++, urlFileOffset); - - urlFileOffset += (urlsBTreeContext.calculateSize(length)); - } - } - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/radix/EdgeIndexBucket.java similarity index 93% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/radix/EdgeIndexBucket.java index 05bcfe75..2e8fdcd2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/radix/EdgeIndexBucket.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.index; +package nu.marginalia.wmsa.edge.index.radix; import nu.marginalia.wmsa.edge.index.EdgeIndexControl; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriter; -import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.index.reader.query.Query; +import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader; +import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriter; +import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.service.query.Query; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchEngineRanking.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchEngineRanking.java index 220a9708..abaced82 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchEngineRanking.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.conversion; +package nu.marginalia.wmsa.edge.index.service; import gnu.trove.list.TIntList; import gnu.trove.map.hash.TIntIntHashMap; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java similarity index 64% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java index fcf6d175..615fbc34 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.conversion; +package nu.marginalia.wmsa.edge.index.service; import com.google.inject.Inject; import com.google.inject.Singleton; @@ -10,7 +10,7 @@ import lombok.SneakyThrows; import nu.marginalia.util.ranking.BetterReversePageRank; import nu.marginalia.util.ranking.BetterStandardPageRank; import nu.marginalia.util.ranking.BuggyStandardPageRank; -import nu.marginalia.util.ranking.RankingDomainFetcher; +import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.index.model.RankingSettings; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -18,28 +18,41 @@ import org.slf4j.LoggerFactory; @Singleton public class SearchIndexDao { private final HikariDataSource dataSource; - private RankingDomainFetcher rankingDomains; private final RankingSettings rankingSettings; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject public SearchIndexDao(HikariDataSource dataSource, - RankingDomainFetcher rankingDomains, RankingSettings rankingSettings) { this.dataSource = 
dataSource; - this.rankingDomains = rankingDomains; this.rankingSettings = rankingSettings; logger.info("SearchIndexDao ranking settings = {}", rankingSettings); } + @SneakyThrows + public TIntHashSet getSpamDomains() { + final TIntHashSet result = new TIntHashSet(1_000_000); + + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) { + var rsp = stmt.executeQuery(); + while (rsp.next()) { + result.add(rsp.getInt(1)); + } + } + } + + return result; + } + @SneakyThrows public TIntHashSet goodUrls() { TIntHashSet domains = new TIntHashSet(10_000_000, 0.5f, -1); TIntHashSet urls = new TIntHashSet(100_000_000, 0.5f, -1); try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) { + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND STATE>=0")) { stmt.setFetchSize(10_000); var rsp = stmt.executeQuery(); while (rsp.next()) { @@ -66,36 +79,36 @@ public class SearchIndexDao { @SneakyThrows public TIntList getRetroDomains() { - var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2); + var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new)); + return spr.pageRankWithPeripheralNodes(spr.size()/2, false); } @SneakyThrows public TIntList getSmallWebDomains() { - var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new)); + var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new)); rpr.setMaxKnownUrls(750); - return 
rpr.pageRankWithPeripheralNodes(rpr.size()); + return rpr.pageRankWithPeripheralNodes(rpr.size(), false); } @SneakyThrows public TIntList getAcademiaDomains() { - var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2); + var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), rankingSettings.academia.toArray(String[]::new)); + return spr.pageRankWithPeripheralNodes(spr.size()/2, false); } @SneakyThrows public TIntList getStandardDomains() { - var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2); + var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new)); + return spr.pageRankWithPeripheralNodes(spr.size()/2, false); } @SneakyThrows public TIntList getSpecialDomains() { TIntArrayList results = new TIntArrayList(); try (var connection = dataSource.getConnection(); - var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'") + var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE=2") ) { var rs = stmt.executeQuery(); while (rs.next()) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java similarity index 91% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java index 863c0c65..91065101 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.index.reader; +package nu.marginalia.wmsa.edge.index.service; import 
com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; -import nu.marginalia.wmsa.edge.index.EdgeIndexBucket; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; +import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket; +import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader; +import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; +import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java new file mode 100644 index 00000000..d1c9f10a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java @@ -0,0 +1,6 @@ +package nu.marginalia.wmsa.edge.index.service; + +public enum SearchOrder { + ASCENDING, + REVERSED +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryReader.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryReader.java index ca10c000..90d270d2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryReader.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.dictionary; +package 
nu.marginalia.wmsa.edge.index.service.dictionary; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryWriter.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryWriter.java index 906231be..9ce1b149 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/DictionaryWriter.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.dictionary; +package nu.marginalia.wmsa.edge.index.service.dictionary; import com.google.inject.Inject; import com.google.inject.Singleton; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/TokenCompressor.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/TokenCompressor.java index 5a3d73ab..9f26fffd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/dictionary/TokenCompressor.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.dictionary; +package nu.marginalia.wmsa.edge.index.service.dictionary; import nu.marginalia.util.ByteFolder; import nu.marginalia.util.dict.DictionaryHashMap; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java index 2242f476..fd7f529f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.conversion; +package nu.marginalia.wmsa.edge.index.service.index; public class ConversionUnnecessaryException extends Exception { public ConversionUnnecessaryException() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java index 042f8f54..17e62437 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java @@ -1,18 +1,20 @@ -package nu.marginalia.wmsa.edge.index.reader; +package nu.marginalia.wmsa.edge.index.service.index; import com.google.inject.Inject; import com.google.inject.name.Named; import com.upserve.uppend.blobs.NativeIO; import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable; import nu.marginalia.util.btree.BTreeReader; import nu.marginalia.util.multimap.MultimapFileLong; 
+import org.eclipse.jetty.util.thread.ThreadPool; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; +import java.util.concurrent.ForkJoinPool; import java.util.stream.LongStream; public class SearchIndex implements AutoCloseable { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java index 0827b4e7..c9b69386 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.conversion; +package nu.marginalia.wmsa.edge.index.service.index; import com.google.inject.Inject; import com.google.inject.name.Named; @@ -6,10 +6,9 @@ import gnu.trove.set.hash.TIntHashSet; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter; +import nu.marginalia.wmsa.edge.index.service.index.wordstable.WordsTableWriter; +import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import nu.marginalia.util.btree.BTreeWriter; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.multimap.MultimapFileLong; @@ -33,24 +32,18 @@ public class 
SearchIndexConverter { private final long fileLength; private final long urlsFileSize; - private final Path tmpFileDir; - private final FileChannel urlsTmpFileChannel; private final int wordCount; private final MultimapFileLong urlsTmpFileMap; private final Logger logger = LoggerFactory.getLogger(getClass()); private final IndexBlock block; private final int bucketId; - - + @org.jetbrains.annotations.NotNull private final File urlsFile; private final SearchIndexPartitioner partitioner; private final TIntHashSet spamDomains; private final MultimapSorter urlTmpFileSorter; - private final static int internalSortLimit = - Boolean.getBoolean("small-ram") ? 1024*1024 : 1024*1024*256; - @SneakyThrows public static long wordCount(File inputFile) { try (RandomAccessFile raf = new RandomAccessFile(inputFile, "r")) { @@ -59,6 +52,7 @@ public class SearchIndexConverter { } } + @SneakyThrows @Inject public SearchIndexConverter(IndexBlock block, int bucketId, @Named("tmp-file-dir") Path tmpFileDir, @@ -67,15 +61,13 @@ public class SearchIndexConverter { @Named("edge-index-write-urls-file") File outputFileUrls, SearchIndexPartitioner partitioner, EdgeDomainBlacklist blacklist) - throws ConversionUnnecessaryException, IOException + throws ConversionUnnecessaryException { this.block = block; this.bucketId = bucketId; - this.tmpFileDir = tmpFileDir; - this.urlsFile = outputFileUrls; + urlsFile = outputFileUrls; this.partitioner = partitioner; this.spamDomains = blacklist.getSpamDomains(); - logger.info("Converting {} ({}) {}", block.id, block, inputFile); Files.deleteIfExists(outputFileWords.toPath()); @@ -97,16 +89,18 @@ public class SearchIndexConverter { urlsFileSize = getUrlsSize(buffer, inputChannel); var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); + + var urlsTmpFileRaf = new RandomAccessFile(tmpUrlsFile.toFile(), "rw"); urlsTmpFileChannel = urlsTmpFileRaf.getChannel(); urlsTmpFileMap = new MultimapFileLong(urlsTmpFileRaf, 
FileChannel.MapMode.READ_WRITE, urlsFileSize, 8*1024*1024, false); - urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit); + urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, 1024*1024*256); logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block); - WordIndexOffsetsTable wordIndexTable = createWordIndexTable(outputFileWords, inputChannel); + long[] wordIndexTable = createWordIndexTable(outputFileWords, inputChannel); logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block); - createUrlTable(buffer, raf, wordIndexTable); + createUrlTable(tmpFileDir, buffer, raf, wordIndexTable); Files.delete(tmpUrlsFile); raf.close(); @@ -146,69 +140,99 @@ public class SearchIndexConverter { return reader.size; } - private void createUrlTable(ByteBuffer buffer, RandomAccessFile raf, WordIndexOffsetsTable wordOffsetsTable) throws IOException { - logger.info("Table size = {}", wordOffsetsTable.length()); - + private void createUrlTable(Path tmpFileDir, ByteBuffer buffer, RandomAccessFile raf, long[] wordIndexTable) throws IOException { + logger.debug("Table size = {}", wordIndexTable.length); + int[] wordIndex = new int[wordIndexTable.length]; raf.seek(FILE_HEADER_SIZE); var channel = raf.getChannel(); try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) { - int[] wordWriteOffset = new int[wordOffsetsTable.length()]; - - new IndexReader(buffer, channel) { + var reader = new IndexReader(buffer, channel) { @Override public void eachWord(long urlId, int wordId) throws IOException { - if (wordId >= wordWriteOffset.length) + if (wordId >= wordIndex.length) return; + if (wordId != 0) { + if (!(wordIndexTable[wordId - 1] + wordIndex[wordId] <= wordIndexTable[wordId])) { + logger.error("Crazy state: wordId={}, index={}, lower={}, upper={}", + wordId, + wordIndex[wordId], + wordIndexTable[wordId - 1], + wordIndexTable[wordId]); + throw new 
IllegalStateException(); + } + } if (wordId > 0) { - rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, translateUrl(urlId)); + rwf.put(wordIndexTable[wordId - 1] + wordIndex[wordId]++, translateUrl(urlId)); } else { - rwf.put(wordWriteOffset[wordId]++, translateUrl(urlId)); + rwf.put(wordIndex[wordId]++, translateUrl(urlId)); } } - }.read(); + }; + + reader.read(); rwf.write(urlsTmpFileChannel); } urlsTmpFileChannel.force(false); - logger.info("URL TMP Table: {} Mb", channel.position()/(1024*1024)); - if (wordOffsetsTable.length() > 0) { - logger.info("Sorting urls table"); - - wordOffsetsTable.forEach(urlTmpFileSorter::sort); + logger.debug("URL TMP Table: {} Mb", channel.position()/(1024*1024)); + if (wordIndexTable.length > 0) { + logger.debug("Sorting urls table"); + sortUrls(wordIndexTable); urlsTmpFileMap.force(); } else { logger.warn("urls table empty -- nothing to sort"); } - logger.info("Writing BTree"); + + long idx = 0; + try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) { var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext); - wordOffsetsTable.fold((accumulatorIdx, start, length) -> { - // Note: The return value is accumulated into accumulatorIdx! 
+ if (wordIndexTable[0] != 0) { + int start = 0; + int end = (int) wordIndexTable[0]; - return writer.write(accumulatorIdx, length, - slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length)); - }); + idx += writer.write(idx, (int) wordIndexTable[0], + offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end)); + } + for (int i = 1; i < wordIndexTable.length; i++) { + if (wordIndexTable[i] != wordIndexTable[i - 1]) { + long start = wordIndexTable[i-1]; + long end = wordIndexTable[i]; + + idx += writer.write(idx, (int) (end-start), + offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end)); + } + } } catch (Exception e) { - logger.error("Error while writing BTree", e); + e.printStackTrace(); } } - private WordIndexOffsetsTable createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws IOException { + @SneakyThrows + private void sortUrls(long[] wordIndices) { + urlTmpFileSorter.sort( 0, (int) wordIndices[0]); + + for (int i = 1; i < wordIndices.length; i++) { + urlTmpFileSorter.sort(wordIndices[i-1], (int) (wordIndices[i] - wordIndices[i-1])); + } + } + + private long[] createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws Exception { inputChannel.position(FILE_HEADER_SIZE); logger.debug("Table size = {}", wordCount); WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount); - ByteBuffer buffer = ByteBuffer.allocateDirect(8* SearchIndexWriterImpl.MAX_BLOCK_SIZE); + ByteBuffer buffer = ByteBuffer.allocateDirect(8*SearchIndexWriterImpl.MAX_BLOCK_SIZE); logger.debug("Reading words"); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexPreconverter.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java rename to 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexPreconverter.java index 9e851025..5149b546 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexPreconverter.java @@ -1,9 +1,10 @@ -package nu.marginalia.wmsa.edge.index.conversion; +package nu.marginalia.wmsa.edge.index.service.index; import com.google.inject.Inject; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; +import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java index 8e7fea81..7baeb8ae 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.index.reader; +package nu.marginalia.wmsa.edge.index.service.index; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryBuilder; -import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.index.reader.query.Query; +import 
nu.marginalia.wmsa.edge.index.service.query.IndexQueryBuilder; +import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.service.query.Query; import org.apache.commons.lang3.tuple.Pair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -105,8 +105,10 @@ public class SearchIndexReader implements AutoCloseable { .mapToLong(idx -> idx.numUrls(word)) .sum() ); + } + public IndexBlock getBlockForResult(int searchTerm, long urlId) { for (var block : indicesBySearchOrder) { var index = indices.get(block); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriter.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriter.java index 11fc186a..ca5d70b3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriter.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.journal; +package nu.marginalia.wmsa.edge.index.service.index; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.EdgeDomain; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java index cf76ada2..2f482815 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.edge.index.journal; +package nu.marginalia.wmsa.edge.index.service.index; import io.reactivex.rxjava3.disposables.Disposable; import io.reactivex.rxjava3.schedulers.Schedulers; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; +import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.EdgeUrl; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/BtreeWordsTable.java similarity index 58% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/BtreeWordsTable.java index 2bde1aa7..0a6a70c0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/BtreeWordsTable.java @@ -1,80 +1,36 @@ -package nu.marginalia.wmsa.edge.index.reader; +package nu.marginalia.wmsa.edge.index.service.index.wordstable; import com.upserve.uppend.blobs.NativeIO; import nu.marginalia.util.btree.BTreeReader; import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.channels.FileChannel; import java.util.function.LongConsumer; -import static 
nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter.wordsBTreeContext; +import static nu.marginalia.wmsa.edge.index.service.index.wordstable.WordsTableWriter.wordsBTreeContext; -public class IndexWordsTable implements AutoCloseable { - protected final MultimapFileLong words; - protected final BTreeReader reader; - protected final BTreeHeader header; - protected final int HEADER_OFFSET = 1; - final Logger logger = LoggerFactory.getLogger(getClass()); +public class BtreeWordsTable extends IndexWordsTable{ + private final MultimapFileLong words; + private final BTreeReader reader; + private final BTreeHeader header; + private final int HEADER_OFFSET = 1; - private static final int BUFFER_SIZE = 1024*1024*64; - - public IndexWordsTable(MultimapFileLong words) { + public BtreeWordsTable(MultimapFileLong words) { this.words = words; + reader = new BTreeReader(words, wordsBTreeContext); header = reader.getHeader(HEADER_OFFSET); madvise(); } - public static IndexWordsTable ofFile(RandomAccessFile file) throws IOException { - var wordsFile = openWordsFile(file); - long signature = wordsFile.get(0); - - if (signature == Strategy.BTREE.ordinal()) { - return new IndexWordsTable(wordsFile); - } - - throw new IllegalArgumentException("Unknown signature " + signature); - } - - private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException { - return new MultimapFileLong(wordsFile, - FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false); - } - - public long positionForWord(int wordId) { - - long offset = reader.offsetForEntry(header, wordId); - if (offset < 0) { - return -1L; - } - - return words.get(offset+1); - } - - public int wordLength(int wordId) { - - long offset = reader.offsetForEntry(header, wordId); - if (offset < 0) { - return -1; - } - - return (int)(words.get(offset) >> 32); - } - - protected void madvise() { + private void madvise() { words.advice(NativeIO.Advice.Random); 
words.advice0(NativeIO.Advice.WillNeed); var h = reader.getHeader(HEADER_OFFSET); int length = (int)(h.dataOffsetLongs() - h.indexOffsetLongs()); - words.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length); words.pokeRange(h.indexOffsetLongs(), length); } @@ -102,13 +58,31 @@ public class IndexWordsTable implements AutoCloseable { } } + @Override + public long positionForWord(int wordId) { + + long offset = reader.offsetForEntry(header, wordId); + if (offset < 0) { + return -1L; + } + + return words.get(offset+1); + } + + @Override + public int wordLength(int wordId) { + + long offset = reader.offsetForEntry(header, wordId); + if (offset < 0) { + return -1; + } + + return (int)(words.get(offset) >> 32); + } + @Override public void close() throws Exception { words.close(); } - public enum Strategy { - BTREE - } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java new file mode 100644 index 00000000..5b557db1 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java @@ -0,0 +1,48 @@ +package nu.marginalia.wmsa.edge.index.service.index.wordstable; + +import nu.marginalia.util.multimap.MultimapFileLong; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.channels.FileChannel; +import java.util.function.LongConsumer; + +public abstract class IndexWordsTable implements AutoCloseable { + final Logger logger = LoggerFactory.getLogger(getClass()); + + private static final int BUFFER_SIZE = 1024*1024*64; + + public static IndexWordsTable ofFile(RandomAccessFile file) throws IOException { + var wordsFile = openWordsFile(file); + long signature = wordsFile.get(0); + + if (signature == Strategy.BTREE.ordinal()) { + return new 
BtreeWordsTable(wordsFile); + } + throw new IllegalArgumentException("Unknown signature " + signature); + } + + private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException { + return new MultimapFileLong(wordsFile, + FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false); + } + + public abstract long positionForWord(int wordId); + + public abstract int wordLength(int wordId); + public abstract void forEachWordsOffset(LongConsumer offsetConsumer); + + @Override + public void close() throws Exception { + + } + + public record TableWordRange(long start, long end) {} + + public enum Strategy { + FLAT, HASH, BTREE_OLD, BTREE + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java new file mode 100644 index 00000000..3097dd47 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java @@ -0,0 +1,85 @@ +package nu.marginalia.wmsa.edge.index.service.index.wordstable; + +import nu.marginalia.util.btree.BTreeWriter; +import nu.marginalia.util.btree.model.BTreeContext; +import nu.marginalia.util.multimap.MultimapFileLong; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; + +import static nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter.urlsBTreeContext; + +public class WordsTableWriter { + private final long[] table; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8); + + public WordsTableWriter(int length) { + table = new long[length]; + } + + public void acceptWord(int wordId) { + if (wordId >= table.length) { + logger.warn("Invalid word-id {}", wordId); + } + else { + table[wordId]++; + } + } + + public long[] 
getTable() { + return table; + } + public void write(File file) throws Exception { + + int tableSize = 0; + + if (table[0] != 0) tableSize = 1; + + for (int i = 1; i < table.length; i++) { + if (table[i] != 0) { + tableSize++; + } + table[i] += table[i-1]; + } + + logger.info("Writing table {} words {} max", tableSize, table.length); + + writeBtreeWordsFile(file, table, tableSize); + + } + + private void writeBtreeWordsFile(File outputFileWords, long[] table, int tableSize) throws Exception { + try (var mmf = MultimapFileLong.forOutput(outputFileWords.toPath(), tableSize/8L)) { + mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal()); + long offset = 1; + + var writer = new BTreeWriter(mmf, wordsBTreeContext); + + writer.write(offset, tableSize, (idx) -> { + long urlFileOffset = 0; + + if (table[0] != 0) { + int length = (int) table[0]; + mmf.put(idx++, (long)length<<32); + mmf.put(idx++, 0); + + urlFileOffset += (urlsBTreeContext.calculateSize(length)); + } + + for (int i = 1; i < table.length; i++) { + if (table[i] != table[i - 1]) { + int length = (int)(table[i] - table[i-1]); + mmf.put(idx++, (long)length << 32 | i); + mmf.put(idx++, urlFileOffset); + + urlFileOffset += (urlsBTreeContext.calculateSize(length)); + } + } + }); + } + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java index 6f54dd2d..be217057 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.index.reader.query; 
+package nu.marginalia.wmsa.edge.index.service.query; import com.google.common.collect.Streams; -import nu.marginalia.wmsa.edge.index.reader.SearchIndex; +import nu.marginalia.wmsa.edge.index.service.index.SearchIndex; import java.util.Collection; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java index 3608f70a..2ec30e65 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.reader.query; +package nu.marginalia.wmsa.edge.index.service.query; public class IndexSearchBudget { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/Query.java similarity index 73% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/Query.java index 5f343d54..09f7701b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/Query.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.reader.query; +package nu.marginalia.wmsa.edge.index.service.query; import java.util.stream.LongStream; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java index bf5a1d74..cf281116 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java @@ -1,9 +1,11 @@ -package nu.marginalia.wmsa.edge.index.conversion; +package nu.marginalia.wmsa.edge.index.service.query; import com.google.inject.Inject; import com.google.inject.Singleton; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; +import nu.marginalia.wmsa.edge.index.service.SearchEngineRanking; +import nu.marginalia.wmsa.edge.index.service.SearchIndexDao; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java index d1945c9e..53740c95 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java @@ -55,11 +55,8 @@ public class EdgeDomain implements WideHashable { } } } - } - public EdgeUrl toRootUrl() { - // Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http - return new EdgeUrl("http", this, null, "/"); + } public String toString() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java index b10d0e88..119da59d 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java @@ -1,12 +1,27 @@ package nu.marginalia.wmsa.edge.model.crawl; public enum EdgeDomainIndexingState { - ACTIVE, - EXHAUSTED, - SPECIAL, - SOCIAL_MEDIA, - BLOCKED, - REDIR, - ERROR, - UNKNOWN + ACTIVE(0), + EXHAUSTED(1), + SPECIAL(2), + SOCIAL_MEDIA(3), + BLOCKED(-1), + REDIR(-2), + ERROR(-3), + UNKNOWN(-100); + + public final int code; + + EdgeDomainIndexingState(int code) { + this.code = code; + } + + public static EdgeDomainIndexingState fromCode(int code) { + for (var state : values()) { + if (state.code == code) { + return state; + } + } + return UNKNOWN; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java index 02c7197a..0063efd9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java @@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.model.search; import lombok.*; import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.service.SearchOrder; import java.util.Arrays; import java.util.Collections; @@ -20,13 +21,14 @@ public class EdgeSearchSpecification { public final int limitTotal; public final String humanQuery; + public final SearchOrder searchOrder; public boolean stagger; public boolean experimental; public static EdgeSearchSpecification justIncludes(String... 
words) { return new EdgeSearchSpecification( IntStream.range(0, DYNAMIC_BUCKET_LENGTH+1).boxed().toList(), - Collections.singletonList(new EdgeSearchSubquery(Arrays.asList(words), Collections.emptyList(), IndexBlock.Title)), 10, 10, 10, "", false, false); + Collections.singletonList(new EdgeSearchSubquery(Arrays.asList(words), Collections.emptyList(), IndexBlock.Title)), 10, 10, 10, "", SearchOrder.ASCENDING, false, false); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java index d46aa79e..ed5fd013 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java @@ -16,24 +16,25 @@ public class EdgeUrlDetails { public String description; public double urlQuality; + public double urlQualityRaw; + public double domainQuality; + public int links; // DEAD public int words; public String format; public int features; - - - public String ip; // BROKEN - public EdgeDomainIndexingState domainState; - - - public int dataHash; - public EdgePageScoreAdjustment urlQualityAdjustment; + public long rankingId; public double termScore; + + public String ip; // BROKEN + public int domainState; public int queryLength; + public int dataHash; + public long rankingIdAdjustment() { int penalty = 0; @@ -135,7 +136,7 @@ public class EdgeUrlDetails { return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES); } public boolean isSpecialDomain() { - return domainState == EdgeDomainIndexingState.SPECIAL; + return domainState == EdgeDomainIndexingState.SPECIAL.code; } public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java 
index 66004dee..10675cc5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java @@ -11,6 +11,7 @@ import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.service.SearchOrder; import nu.marginalia.wmsa.edge.model.*; import nu.marginalia.wmsa.edge.model.search.*; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet; @@ -135,7 +136,7 @@ public class EdgeSearchOperator { sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block)); - EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, 100, limitPerDomain, limitTotal, "", EdgeSearchProfile.YOLO.equals(profile), false); + EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, 100, limitPerDomain, limitTotal, "", SearchOrder.ASCENDING, EdgeSearchProfile.YOLO.equals(profile), false); return performQuery(ctx, new EdgeSearchQuery(specs), true); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java index 212d09ab..05fcaa04 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java @@ -1,6 +1,7 @@ package nu.marginalia.wmsa.edge.search; import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.service.SearchOrder; import java.util.Arrays; import java.util.Collections; @@ -8,27 +9,27 @@ import java.util.List; import java.util.stream.Collectors; public enum EdgeSearchProfile { - DEFAULT("default", + 
DEFAULT("default", SearchOrder.ASCENDING, Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 0, 1), - MODERN("modern", + MODERN("modern", SearchOrder.ASCENDING, Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 2), - CORPO("corpo", + CORPO("corpo", SearchOrder.ASCENDING, Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 4, 5, 6, 7), - YOLO("yolo", + YOLO("yolo", SearchOrder.ASCENDING, Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 0, 2, 1, 3, 4, 6), - CORPO_CLEAN("corpo-clean", + CORPO_CLEAN("corpo-clean", SearchOrder.ASCENDING, Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 4, 5), - ACADEMIA("academia", + ACADEMIA("academia", SearchOrder.ASCENDING, Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 3), @@ -36,15 +37,17 @@ public enum EdgeSearchProfile { public final String name; + public final SearchOrder order; public final List additionalSearchTerm; public final List buckets; public final List indexBlocks; - EdgeSearchProfile(String name, + EdgeSearchProfile(String name, SearchOrder order, List additionalSearchTerm, List indexBlocks, int... 
buckets) { this.name = name; + this.order = order; this.additionalSearchTerm = additionalSearchTerm; this.indexBlocks = indexBlocks; this.buckets = Arrays.stream(buckets).boxed().collect(Collectors.toList()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java index 6e341721..60520aa9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java @@ -32,7 +32,7 @@ import java.util.regex.Pattern; public class SiteSearchCommand implements SearchCommandInterface { private final EdgeDataStoreDao dataStoreDao; private final EdgeSearchOperator searchOperator; - private final DomainInformationService domainInformationService; + private DomainInformationService domainInformationService; private final Logger logger = LoggerFactory.getLogger(getClass()); private final MustacheRenderer siteInfoRenderer; @@ -91,7 +91,7 @@ public class SiteSearchCommand implements SearchCommandInterface { logger.info("Fetching Site Info: {}", word); var results = domainInformationService.domainInfo(word) - .orElseGet(() -> new DomainInformation(null, false, 0, 0, 0, 0, 0, 0, EdgeDomainIndexingState.UNKNOWN, Collections.emptyList())); + .orElseGet(() -> new DomainInformation(null, false, 0, 0, 0, 0, 0, 0, 0, EdgeDomainIndexingState.UNKNOWN, Collections.emptyList())); logger.debug("Results = {}", results); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java index d94ae487..c5c19187 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java @@ -18,6 +18,7 @@ public class DomainInformation { int pagesIndexed; int incomingLinks; int outboundLinks; + double nominalQuality; double ranking; EdgeDomainIndexingState state; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java index 1d77a9d0..d3775dd9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java @@ -138,6 +138,7 @@ public class QueryFactory { .subqueries(subqueries) .limitByBucket(50) .limitTotal(100) + .searchOrder(profile.order) .humanQuery(query) .buckets(profile.buckets); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java index 22b24aca..487e1556 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java @@ -107,7 +107,7 @@ public class SearchResultDecorator { private double calculateTermScore(IndexBlock block, EdgeSearchResultItem resultItem, EdgeUrlDetails details) { return valuator.evaluateTerms(resultItem.scores, block, details.words) / Math.sqrt(1 + resultItem.queryLength) - + ((details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0); + + ((details.domainState == EdgeDomainIndexingState.SPECIAL.code) ? 
1.25 : 0); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java index 2f79a9ea..54179d64 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java @@ -1,43 +1,24 @@ package nu.marginalia.wmsa.edge.search.siteinfo; -import com.zaxxer.hikari.HikariDataSource; -import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import nu.marginalia.wmsa.edge.search.model.DomainInformation; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import javax.inject.Inject; import javax.inject.Singleton; -import java.sql.SQLException; -import java.util.ArrayList; -import java.util.Collections; import java.util.List; import java.util.Optional; -/* - TODO: This class needs to be refactored, a lot of - these SQL queries are redundant and can be - collapsed into one single query that fetches - all the information - */ @Singleton public class DomainInformationService { - private EdgeDataStoreDaoImpl dataStoreDao; - private HikariDataSource dataSource; - private final Logger logger = LoggerFactory.getLogger(getClass()); + private EdgeDataStoreDao dataStore; @Inject - public DomainInformationService( - EdgeDataStoreDaoImpl dataStoreDao, - HikariDataSource dataSource) { - this.dataStoreDao = dataStoreDao; - this.dataSource = dataSource; + public DomainInformationService(EdgeDataStoreDao dataStore) { + this.dataStore = dataStore; } @@ -47,28 +28,29 @@ public class DomainInformationService { if (domainId == null) { return 
Optional.empty(); } - EdgeDomain domain = dataStoreDao.getDomain(domainId); + EdgeDomain domain = dataStore.getDomain(domainId); - boolean blacklisted = isBlacklisted(domain); - int pagesKnown = getPagesKnown(domainId); - int pagesVisited = getPagesVisited(domainId); - int pagesIndexed = getPagesIndexed(domainId); - int incomingLinks = getIncomingLinks(domainId); - int outboundLinks = getOutboundLinks(domainId); - double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100; - EdgeDomainIndexingState state = getDomainState(domainId); - List linkingDomains = getLinkingDomains(domainId); + boolean blacklisted = dataStore.isBlacklisted(domain); + int pagesKnown = dataStore.getPagesKnown(domainId); + int pagesVisited = dataStore.getPagesVisited(domainId); + int pagesIndexed = dataStore.getPagesIndexed(domainId); + int incomingLinks = dataStore.getIncomingLinks(domainId); + int outboundLinks = dataStore.getOutboundLinks(domainId); + double rank = Math.round(10000.0*(1.0-dataStore.getRank(domainId)))/100; + EdgeDomainIndexingState state = dataStore.getDomainState(domainId); + double nominalQuality = Math.round(100*100*Math.exp(dataStore.getDomainQuality(domainId)))/100.; + List linkingDomains = dataStore.getLinkingDomains(domainId); - return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, rank, state, linkingDomains)); + return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, nominalQuality, rank, state, linkingDomains)); } private EdgeId getDomainFromPartial(String site) { try { - return dataStoreDao.getDomainId(new EdgeDomain(site)); + return dataStore.getDomainId(new EdgeDomain(site)); } catch (Exception ex) { try { - return dataStoreDao.getDomainId(new EdgeDomain(site)); + return dataStore.getDomainId(new EdgeDomain(site)); } catch (Exception ex2) { return null; @@ -76,178 +58,4 @@ public class 
DomainInformationService { } } - - @SneakyThrows - public boolean isBlacklisted(EdgeDomain domain) { - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) { - stmt.setString(1, domain.domain); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return true; - } else { - return false; - } - } - } - } - - @SneakyThrows - public int getPagesKnown(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - public int getPagesVisited(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - - @SneakyThrows - public int getPagesIndexed(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - public int getIncomingLinks(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if 
(rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - @SneakyThrows - public int getOutboundLinks(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - public double getDomainQuality(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getDouble(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return -5; - } - } - - public EdgeDomainIndexingState getDomainState(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return EdgeDomainIndexingState.valueOf(rsp.getString(1)); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return EdgeDomainIndexingState.ERROR; - } - - public List getLinkingDomains(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - List results = new ArrayList<>(25); - try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? 
ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - results.add(new EdgeDomain(rsp.getString(1))); - } - return results; - } catch (Exception ex) { - logger.error("DB error", ex); - } - - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return Collections.emptyList(); - } - - public double getRank(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getDouble(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return 1; - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java index 05c67481..bb946238 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java @@ -3,13 +3,12 @@ package nu.marginalia.wmsa.edge.tools; import com.google.inject.Inject; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; -import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import nu.marginalia.wmsa.edge.index.model.RankingSettings; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexDao; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.service.SearchIndexDao; +import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import 
org.mariadb.jdbc.Driver; import org.roaringbitmap.longlong.Roaring64Bitmap; import org.slf4j.Logger; @@ -60,9 +59,7 @@ public class IndexMergerMain { } var hikari = new DatabaseModule().provideConnection(); - var ds = new DatabaseModule().provideConnection(); - var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); - var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, domains, new RankingSettings())); + var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, new RankingSettings())); var blacklist = new EdgeDomainBlacklistImpl(hikari); new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist); diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index 36ab040a..fc9e515d 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -1,11 +1,24 @@ -DROP TABLE IF EXISTS DOMAIN_METADATA; +DROP TABLE IF EXISTS EC_URL_LINK; +DROP VIEW IF EXISTS EC_PAGE_VIEW; + +DROP TABLE IF EXISTS DISC_DOMAIN_TAG; +DROP TABLE IF EXISTS DISC_TAG; +DROP TABLE IF EXISTS DISC_USER; + +DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS; DROP TABLE IF EXISTS EC_FEED_URL; DROP TABLE IF EXISTS EC_DOMAIN_LINK; DROP TABLE IF EXISTS EC_PAGE_DATA; DROP TABLE IF EXISTS EC_URL; -DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS; DROP TABLE IF EXISTS EC_DOMAIN; +DROP TABLE IF EXISTS EC_TOP_DOMAIN; +DROP TABLE IF EXISTS EC_URL_DETAILS; +DROP VIEW IF EXISTS EC_URL_VIEW; +DROP VIEW IF EXISTS EC_URL_PART_HASH; +DROP TABLE IF EXISTS EC_URL_WORD; +DROP TABLE IF EXISTS EC_DICTIONARY; +DROP TABLE IF EXISTS DOMAIN_METADATA; CREATE TABLE IF NOT EXISTS DOMAIN_METADATA ( ID INT PRIMARY KEY, @@ -14,31 +27,52 @@ CREATE TABLE IF NOT EXISTS DOMAIN_METADATA ( GOOD_URLS INT DEFAULT 0 ); +CREATE TABLE IF NOT EXISTS EC_TOP_DOMAIN ( + ID INT PRIMARY KEY AUTO_INCREMENT, + URL_PART VARCHAR(255) UNIQUE NOT NULL, + 
ALIVE BOOLEAN DEFAULT TRUE NOT NULL +) +CHARACTER SET utf8mb4 +COLLATE utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS EC_DOMAIN ( ID INT PRIMARY KEY AUTO_INCREMENT, + URL_PART VARCHAR(255) UNIQUE NOT NULL, + INDEXED INT DEFAULT 0 NOT NULL, + QUALITY DOUBLE DEFAULT -5 NOT NULL, + QUALITY_RAW DOUBLE DEFAULT -5 NOT NULL, + QUALITY_ORIGINAL DOUBLE DEFAULT -5 NOT NULL, - DOMAIN_NAME VARCHAR(255) UNIQUE NOT NULL, - DOMAIN_TOP VARCHAR(255) NOT NULL, - - INDEXED INT DEFAULT 0 NOT NULL COMMENT "~number of documents visited / 100", - STATE ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN') NOT NULL DEFAULT 'active' COMMENT "@see EdgeDomainIndexingState", + URL_TOP_DOMAIN_ID INT NOT NULL, + URL_SUBDOMAIN VARCHAR(255) NOT NULL, + STATE INT DEFAULT 0 NOT NULL, RANK DOUBLE, + DOMAIN_ALIAS INTEGER, - IP VARCHAR(32), INDEX_DATE TIMESTAMP DEFAULT NOW(), DISCOVER_DATE TIMESTAMP DEFAULT NOW(), - IS_ALIVE BOOLEAN AS (STATE='ACTIVE' OR STATE='EXHAUSTED' OR STATE='SPECIAL' OR STATE='SOCIAL_MEDIA') VIRTUAL + FOREIGN KEY (URL_TOP_DOMAIN_ID) REFERENCES EC_TOP_DOMAIN(ID) ON DELETE CASCADE +) +CHARACTER SET utf8mb4 +COLLATE utf8mb4_unicode_ci; + +CREATE TABLE IF NOT EXISTS EC_DOMAIN_HISTORY ( + ID INT PRIMARY KEY AUTO_INCREMENT, + URL_PART VARCHAR(255) UNIQUE NOT NULL, + QUALITY_MEASURE DOUBLE DEFAULT -5 NOT NULL, + INBOUND_LINKS INT DEFAULT 1, + LINK_ADJUSTED_QUALITY DOUBLE GENERATED ALWAYS AS (0.3*QUALITY_MEASURE + 0.7*QUALITY_MEASURE / GREATEST(1, INBOUND_LINKS)), + RANK DOUBLE ) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS EC_DOMAIN_BLACKLIST ( ID INT PRIMARY KEY AUTO_INCREMENT, - URL_DOMAIN VARCHAR(255) UNIQUE NOT NULL + URL_DOMAIN VARCHAR(255) UNIQUE NOT NULL ) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; @@ -47,15 +81,18 @@ CREATE TABLE IF NOT EXISTS EC_URL ( ID INT PRIMARY KEY AUTO_INCREMENT, DOMAIN_ID INT NOT NULL, PROTO ENUM('http','https','gemini') NOT NULL, - PATH VARCHAR(255) NOT NULL COLLATE 
utf8mb4_bin, + URL VARCHAR(255) NOT NULL, PORT INT, - PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain", VISITED BOOLEAN NOT NULL DEFAULT FALSE, + DATA_HASH INTEGER, + QUALITY_MEASURE DOUBLE, STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok', - CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH), + IP VARCHAR(32), + + CONSTRAINT CONS UNIQUE (DOMAIN_ID, URL), FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE ) CHARACTER SET utf8mb4 @@ -64,15 +101,13 @@ COLLATE utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS EC_PAGE_DATA ( ID INT PRIMARY KEY AUTO_INCREMENT, - TITLE VARCHAR(255) NOT NULL, - DESCRIPTION VARCHAR(255) NOT NULL, + TITLE VARCHAR(255), + DESCRIPTION VARCHAR(255), - WORDS_TOTAL INTEGER NOT NULL, - FORMAT ENUM('PLAIN', 'UNKNOWN', 'HTML123', 'HTML4', 'XHTML', 'HTML5', 'MARKDOWN') NOT NULL, - FEATURES INT COMMENT "Bit-encoded feature set of document, @see HtmlFeature" NOT NULL, - - DATA_HASH INTEGER NOT NULL, - QUALITY DOUBLE NOT NULL, + WORDS_DISTINCT INTEGER, + WORDS_TOTAL INTEGER, + FORMAT VARCHAR(8), + FEATURES INT, FOREIGN KEY (ID) REFERENCES EC_URL(ID) ON DELETE CASCADE ) @@ -80,9 +115,13 @@ CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; CREATE TABLE EC_FEED_URL ( - URL VARCHAR(255) PRIMARY KEY, - DOMAIN_ID INT, + ID INT PRIMARY KEY AUTO_INCREMENT, + DOMAIN_ID INT NOT NULL, + PROTO VARCHAR(8) NOT NULL, + URL VARCHAR(255) NOT NULL, + PORT INT, + CONSTRAINT CONS UNIQUE (DOMAIN_ID, URL), FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE ) CHARACTER SET utf8mb4 @@ -111,63 +150,92 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK ( FOREIGN KEY (DEST_DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE ); +CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK_AGGREGATE ( + DOMAIN_ID INT PRIMARY KEY NOT NULL, + LINKS INT +); + CREATE OR REPLACE VIEW EC_URL_VIEW AS SELECT - IF(PORT IS NULL, - CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, EC_URL.PATH), - CONCAT(EC_URL.PROTO, 
"://", EC_DOMAIN.DOMAIN_NAME, ":", EC_URL.PORT, EC_URL.PATH)) - AS URL, - EC_URL.PATH_HASH AS PATH_HASH, - EC_URL.PATH AS PATH, - EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME, - EC_DOMAIN.DOMAIN_TOP AS DOMAIN_TOP, + EC_DOMAIN.URL_PART AS URL_DOMAIN, + EC_URL.URL AS URL_PATH, + EC_TOP_DOMAIN.URL_PART AS URL_TOP, EC_URL.ID AS ID, EC_DOMAIN.ID AS DOMAIN_ID, - + EC_TOP_DOMAIN.ID AS TOP_DOMAIN_ID, + EC_URL.PROTO AS URL_PROTO, + EC_URL.PORT AS URL_PORT, EC_URL.VISITED AS VISITED, - - EC_PAGE_DATA.QUALITY AS QUALITY, - EC_PAGE_DATA.DATA_HASH AS DATA_HASH, + EC_URL.DATA_HASH AS DATA_HASH, + EC_URL.QUALITY_MEASURE AS URL_QUALITY_MEASURE, + EC_DOMAIN.QUALITY AS DOMAIN_QUALITY_MEASURE, + EC_DOMAIN.QUALITY_RAW AS QUALITY_RAW, EC_PAGE_DATA.TITLE AS TITLE, EC_PAGE_DATA.DESCRIPTION AS DESCRIPTION, + EC_URL.IP AS IP, + EC_DOMAIN.STATE AS STATE, EC_PAGE_DATA.WORDS_TOTAL AS WORDS_TOTAL, EC_PAGE_DATA.FORMAT AS FORMAT, EC_PAGE_DATA.FEATURES AS FEATURES, - - EC_DOMAIN.IP AS IP, - EC_DOMAIN.STATE AS STATE, EC_DOMAIN.RANK AS RANK, EC_DOMAIN.STATE AS DOMAIN_STATE FROM EC_URL LEFT JOIN EC_PAGE_DATA ON EC_PAGE_DATA.ID = EC_URL.ID INNER JOIN EC_DOMAIN - ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID; + ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID + INNER JOIN EC_TOP_DOMAIN + ON EC_DOMAIN.URL_TOP_DOMAIN_ID=EC_TOP_DOMAIN.ID; + +CREATE OR REPLACE VIEW EC_DISCOVER_TASKS_VIEW AS + SELECT + ID, + URL_PART + FROM EC_DOMAIN + WHERE + DOMAIN_ALIAS IS NULL + AND INDEXED = 0 + ORDER BY QUALITY DESC, ID ASC; CREATE OR REPLACE VIEW EC_RELATED_LINKS_VIEW AS SELECT SOURCE_DOMAIN_ID, - SOURCE_DOMAIN.DOMAIN_NAME AS SOURCE_DOMAIN, - SOURCE_DOMAIN.DOMAIN_TOP AS SOURCE_TOP_DOMAIN, + SOURCE_DOMAIN.URL_PART AS SOURCE_URL, + SOURCE_TOP_DOMAIN.URL_PART AS SOURCE_TOP_URL, DEST_DOMAIN_ID, - DEST_DOMAIN.DOMAIN_NAME AS DEST_DOMAIN, - DEST_DOMAIN.DOMAIN_TOP AS DEST_TOP_DOMAIN + DEST_DOMAIN.URL_PART AS DEST_URL, + DEST_TOP_DOMAIN.URL_PART AS DEST_TOP_URL FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN AS SOURCE_DOMAIN ON SOURCE_DOMAIN.ID=SOURCE_DOMAIN_ID + 
INNER JOIN EC_TOP_DOMAIN AS SOURCE_TOP_DOMAIN + ON SOURCE_TOP_DOMAIN.ID=SOURCE_DOMAIN.URL_TOP_DOMAIN_ID INNER JOIN EC_DOMAIN AS DEST_DOMAIN ON DEST_DOMAIN.ID=DEST_DOMAIN_ID + INNER JOIN EC_TOP_DOMAIN AS DEST_TOP_DOMAIN + ON DEST_TOP_DOMAIN.ID=DEST_DOMAIN.URL_TOP_DOMAIN_ID ; CREATE OR REPLACE VIEW EC_RELATED_LINKS_IN AS SELECT IN_URL.ID AS SRC_URL_ID, - OUT_URL.ID AS DEST_URL_ID - FROM EC_DOMAIN_LINK - INNER JOIN EC_URL AS IN_URL ON IN_URL.DOMAIN_ID=EC_DOMAIN_LINK.SOURCE_DOMAIN_ID - INNER JOIN EC_URL AS OUT_URL ON OUT_URL.DOMAIN_ID=EC_DOMAIN_LINK.DEST_DOMAIN_ID - WHERE IN_URL.VISITED AND IN_URL.STATE = 'ok' - AND OUT_URL.VISITED AND OUT_URL.STATE = 'ok'; + IN_URL.QUALITY_MEASURE AS SRC_URL_QUALITY, + OUT_URL.ID AS DEST_URL_ID, + OUT_URL.QUALITY_MEASURE AS DEST_URL_QUALITY + FROM EC_URL AS IN_URL + INNER JOIN EC_DOMAIN_LINK + ON IN_URL.DOMAIN_ID=EC_DOMAIN_LINK.SOURCE_DOMAIN_ID + INNER JOIN EC_URL AS OUT_URL + ON OUT_URL.DOMAIN_ID=EC_DOMAIN_LINK.DEST_DOMAIN_ID + WHERE IN_URL.VISITED=TRUE + AND IN_URL.DATA_HASH IS NOT NULL + AND OUT_URL.VISITED=TRUE + AND OUT_URL.DATA_HASH IS NOT NULL; + +CREATE TABLE IF NOT EXISTS EC_DOMAIN_BACKLINKS ( + ID INT PRIMARY KEY, + LINKEDNESS INT +); CREATE TABLE IF NOT EXISTS EC_API_KEY ( LICENSE_KEY VARCHAR(255) UNIQUE, @@ -177,8 +245,16 @@ CREATE TABLE IF NOT EXISTS EC_API_KEY ( RATE INT DEFAULT 10 ); +CREATE INDEX IF NOT EXISTS EC_DOMAIN_RANK_INDEX ON EC_DOMAIN (RANK); +CREATE INDEX IF NOT EXISTS EC_DOMAIN_QUALITY_INDEX ON EC_DOMAIN (QUALITY,STATE); + CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED); -CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP); +CREATE INDEX IF NOT EXISTS EC_DOMAIN_ID_INDEXED_INDEX ON EC_DOMAIN (ID, INDEXED); +CREATE INDEX IF NOT EXISTS EC_DOMAIN_TRIO ON EC_DOMAIN (STATE, DOMAIN_ALIAS, INDEXED, QUALITY); + +CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED); +CREATE INDEX IF NOT EXISTS EC_URL_VISITED_STATE ON EC_URL (VISITED, STATE); +CREATE INDEX IF NOT 
EXISTS EC_URL_IP ON EC_URL (IP); ---; diff --git a/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb b/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb index cd8abf67..5696b251 100644 --- a/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb @@ -10,4 +10,5 @@ Pages Known: {{pagesKnown}} Pages Indexed: {{pagesKnown}} Inbound Links: {{inboundLinks}} Outbound Links: {{outboundLinks}} +Nominal Quality: {{nominalQuality}}% Crawl Ranking: {{ranking}}% \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/site-info.hdb b/marginalia_nu/src/main/resources/templates/edge/site-info.hdb index 837f320d..19b585b8 100644 --- a/marginalia_nu/src/main/resources/templates/edge/site-info.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/site-info.hdb @@ -37,6 +37,7 @@

Links

+ Nominal Quality: {{nominalQuality}}%
Crawl Ranking: {{ranking}}%
Incoming Links: {{incomingLinks}}
Outbound Links: {{outboundLinks}}
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java b/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java index 26d397a8..84b9f165 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java @@ -43,7 +43,7 @@ public class TestUtil { logger.info("Running script {}", scriptFile); try (var scriptStream = ClassLoader.getSystemResourceAsStream(scriptFile); var stmt = conn.createStatement()) { - for (String s : new String(scriptStream.readAllBytes()).split("(;|---)")) { + for (String s : new String(scriptStream.readAllBytes()).split(";")) { if (!s.isBlank()) { try { Assertions.assertTrue(stmt.executeUpdate(s) >= 0); diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java index 875cda37..1915d989 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java @@ -90,10 +90,10 @@ class BTreeWriterTest { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (slice) -> { + writer.write(0, toPut.size(), (offset) -> { for (int i = 0; i < data.length; i++) { - slice.put(2L*i, data[i]); - slice.put( 2L*i + 1, i); + mmf.put(offset + 2L*i, data[i]); + mmf.put(offset + 2L*i + 1, i); } }); mmf.force(); @@ -133,10 +133,10 @@ class BTreeWriterTest { { var writer = new BTreeWriter(mmf, ctx); - writer.write( 0, toPut.size(), (slice) -> { + writer.write( 0, toPut.size(), (offset) -> { for (int i = 0; i < data.length; i++) { - slice.put(2L*i, data[i]); - slice.put(2L*i + 1, i); + mmf.put(offset + 2L*i, data[i]); + mmf.put(offset + 2L*i + 1, i); } }); mmf.force(); @@ -182,9 +182,9 @@ class BTreeWriterTest { try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), 
(slice) -> { + writer.write(0, toPut.size(), (offset) -> { for (int i = 0; i < data.length; i++) { - slice.put(i, data[i]); + mmf.put(offset + i, data[i]); } }); mmf.force(); @@ -235,9 +235,9 @@ class BTreeWriterTest { try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (slice) -> { + writer.write(0, toPut.size(), (offset) -> { for (int i = 0; i < data.length; i++) { - slice.put(i, data[i]); + mmf.put(offset + i, data[i]); } }); mmf.force(); @@ -288,10 +288,10 @@ class BTreeWriterTest { try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (slice) -> { + writer.write(0, toPut.size(), (offset) -> { for (int i = 0; i < data.length; i++) { - slice.put(i*2L, data[i]); - slice.put(i*2L+1, i); + mmf.put(offset + i*2L, data[i]); + mmf.put(offset + i*2L+1, i); } }); mmf.force(); diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java index 9331a998..326c9b15 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java @@ -27,7 +27,7 @@ class LongPairHashMapTest { try { RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true); - var lphm = LongPairHashMap.createNew(mmf, 1024); + var lphm = new LongPairHashMap(mmf, 1024); toPut.forEach(i -> { lphm.put(new LongPairHashMap.CellData(i, i)); }); @@ -36,7 +36,7 @@ class LongPairHashMapTest { RandomAccessFile raf2 = new RandomAccessFile(tempFile.toFile(), "rw"); MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000, true); - var lphm2 = LongPairHashMap.loadExisting(mmf2); 
+ var lphm2 = new LongPairHashMap(mmf2); toPut.forEach(i -> { Assertions.assertTrue(lphm2.get(i).isSet()); Assertions.assertEquals(i, (int) lphm2.get(i).getKey()); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java deleted file mode 100644 index d839bbb2..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java +++ /dev/null @@ -1,48 +0,0 @@ -package nu.marginalia.wmsa.edge.converting.loader; - -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.testcontainers.containers.MariaDBContainer; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; - -@Testcontainers -class SqlLoadDomainLinksTest { - @Container - static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") - .withDatabaseName("WMSA_prod") - .withUsername("wmsa") - .withPassword("wmsa") - .withInitScript("sql/edge-crawler-cache.sql") - .withNetworkAliases("mariadb"); - - HikariDataSource dataSource; - LoaderData loaderData; - @BeforeEach - public void setUp() { - dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); - - var loadDomains = new SqlLoadDomains(dataSource); - loaderData = new LoaderData(10); - - loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); - loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") }); - } - - @AfterEach - public void tearDown() { - dataSource.close(); - } - - @Test - public void loadDomainLinks() { - var loader = 
new SqlLoadDomainLinks(dataSource); - loader.load(new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) }); - } - -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java deleted file mode 100644 index 25dd18b4..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java +++ /dev/null @@ -1,52 +0,0 @@ -package nu.marginalia.wmsa.edge.converting.loader; - -import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import org.junit.jupiter.api.Test; -import org.testcontainers.containers.MariaDBContainer; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; - -import static org.junit.jupiter.api.Assertions.*; - -@Testcontainers -class SqlLoadDomainsTest { - @Container - static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") - .withDatabaseName("WMSA_prod") - .withUsername("wmsa") - .withPassword("wmsa") - .withInitScript("sql/edge-crawler-cache.sql") - .withNetworkAliases("mariadb"); - - @Test - public void loadDomain() { - - try (var dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());) { - var loadDomains = new SqlLoadDomains(dataSource); - var loaderData = new LoaderData(10); - - loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); - loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu")); - - assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0); - } - - } - - @Test - public void loadDomains() { - - try (var dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());) { - var loadDomains = new SqlLoadDomains(dataSource); - var loaderData = new LoaderData(10); - - loaderData.setTargetDomain(new 
EdgeDomain("www.marginalia.nu")); - loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") }); - - assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0); - assertTrue(loaderData.getDomainId(new EdgeDomain("memex.marginalia.nu")) >= 0); - } - - } -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java deleted file mode 100644 index ecb0e88a..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java +++ /dev/null @@ -1,94 +0,0 @@ -package nu.marginalia.wmsa.edge.converting.loader; - -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; -import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeId; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; -import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.testcontainers.containers.MariaDBContainer; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; - -import java.net.URISyntaxException; -import java.util.List; -import java.util.Set; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -@Testcontainers -class SqlLoadProcessedDocumentTest { - @Container - static MariaDBContainer 
mariaDBContainer = new MariaDBContainer<>("mariadb") - .withDatabaseName("WMSA_prod") - .withUsername("wmsa") - .withPassword("wmsa") - .withInitScript("sql/edge-crawler-cache.sql") - .withNetworkAliases("mariadb"); - - HikariDataSource dataSource; - LoaderData loaderData; - EdgeDataStoreDaoImpl dataStoreDao; - - @BeforeEach - public void setUp() throws URISyntaxException { - dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); - dataStoreDao = new EdgeDataStoreDaoImpl(dataSource); - - var loadDomains = new SqlLoadDomains(dataSource); - var loadUrls = new SqlLoadUrls(dataSource); - - loaderData = new LoaderData(10); - - loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); - loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu")); - - loadUrls.load(loaderData, new EdgeUrl[]{new EdgeUrl("https://www.marginalia.nu/")}); - - } - - @AfterEach - public void tearDown() { - dataStoreDao.clearCaches(); - dataSource.close(); - } - - @Test - public void loadProcessedDocument() throws URISyntaxException { - var loader = new SqlLoadProcessedDocument(dataSource); - var url = new EdgeUrl("https://www.marginalia.nu/"); - - loader.load(loaderData, List.of(new LoadProcessedDocument( - url, - EdgeUrlState.OK, - "TITLE", - "DESCR", - HtmlFeature.encode(Set.of(HtmlFeature.AFFILIATE_LINK)), - EdgeHtmlStandard.HTML5, - 100, - 12345, - -3.14 - ))); - - var details = dataStoreDao.getUrlDetailsMulti(List.of(new EdgeId<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/"))))); - assertEquals(1, details.size()); - - var urlDetails = details.get(0); - - assertEquals("TITLE", urlDetails.getTitle()); - assertEquals("DESCR", urlDetails.getDescription()); - assertTrue(urlDetails.isAffiliate()); - assertEquals(100, urlDetails.words); - assertEquals(12345, urlDetails.dataHash); - assertEquals(-3.14, urlDetails.getUrlQuality()); - } - -} \ No newline at end of file diff --git 
a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java deleted file mode 100644 index eb66da92..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java +++ /dev/null @@ -1,54 +0,0 @@ -package nu.marginalia.wmsa.edge.converting.loader; - -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.testcontainers.containers.MariaDBContainer; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; - -@Testcontainers -class SqlLoadProcessedDomainTest { - @Container - static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") - .withDatabaseName("WMSA_prod") - .withUsername("wmsa") - .withPassword("wmsa") - .withInitScript("sql/edge-crawler-cache.sql") - .withNetworkAliases("mariadb"); - - HikariDataSource dataSource; - LoaderData loaderData; - @BeforeEach - public void setUp() { - - dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); - - var loadDomains = new SqlLoadDomains(dataSource); - loaderData = new LoaderData(10); - - loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); - loadDomains.load(loaderData, new EdgeDomain[]{ new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") }); - } - - @AfterEach - public void tearDown() { - dataSource.close(); - } - - @Test - public void loadProcessedDomain() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); - 
loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), EdgeDomainIndexingState.BLOCKED, "127.0.0.1"); - } - @Test - public void loadDomainAlias() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); - loader.loadAlias(loaderData, new DomainLink(new EdgeDomain("memex.marginalia.nu"), new EdgeDomain("www.marginalia.nu"))); - } -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java deleted file mode 100644 index 5afac733..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java +++ /dev/null @@ -1,50 +0,0 @@ -package nu.marginalia.wmsa.edge.converting.loader; - -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.testcontainers.containers.MariaDBContainer; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; - -import java.net.URISyntaxException; - -@Testcontainers -class SqlLoadUrlsTest { - @Container - static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") - .withDatabaseName("WMSA_prod") - .withUsername("wmsa") - .withPassword("wmsa") - .withInitScript("sql/edge-crawler-cache.sql") - .withNetworkAliases("mariadb"); - - HikariDataSource dataSource; - LoaderData loaderData; - @BeforeEach - public void setUp() { - dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); - - var loadDomains = new SqlLoadDomains(dataSource); - loaderData = new LoaderData(10); - - loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); - loadDomains.load(loaderData, new 
EdgeDomain("www.marginalia.nu")); - } - - @AfterEach - public void tearDown() { - dataSource.close(); - } - - @Test - public void loadUrl() throws URISyntaxException { - var loadUrls = new SqlLoadUrls(dataSource); - loadUrls.load(loaderData, new EdgeUrl[] { new EdgeUrl("https://www.marginalia.nu/") }); - } - -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java index 961d8304..180576fc 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java @@ -1,11 +1,11 @@ package nu.marginalia.wmsa.edge.index.service; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader; +import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; +import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java index 2b2da0fd..6b029da9 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java @@ -3,14 
+3,14 @@ package nu.marginalia.wmsa.edge.index.service; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.util.TestUtil; +import nu.marginalia.wmsa.client.exception.RemoteException; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.edge.index.EdgeIndexService; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; +import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.EdgeId; @@ -23,6 +23,7 @@ import org.junit.jupiter.api.parallel.ResourceAccessMode; import org.junit.jupiter.api.parallel.ResourceLock; import spark.Spark; +import java.io.File; import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; @@ -30,6 +31,7 @@ import java.util.List; import java.util.stream.Collectors; import static nu.marginalia.util.TestUtil.getConnection; +import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java new file mode 100644 index 00000000..f42f2d36 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java @@ -0,0 +1,89 @@ +package nu.marginalia.wmsa.edge.index.service; + +import 
lombok.SneakyThrows; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; + +class SearchIndexConverterTest { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @Test @Disabled @SneakyThrows + public void test() { + // File dictFile = new File("/home/vlofgren/dictionary.dat"); + File inFile = new File("/home/vlofgren/Work/converter/3/page-index.dat"); + + new SearchIndexConverter(IndexBlock.Title, 0, Path.of("/tmp"), inFile, + new File("/home/vlofgren/Work/converter/words.dat"), + new File("/home/vlofgren/Work/converter/urls.dat"), new SearchIndexPartitioner(null), val -> false); + + // sanityCheck(); + } + + @Test @Disabled + public void sanityCheck() { + File inFile = new File("/home/vlofgren/write/6/page-index.dat"); + +// SearchIndexReader sir = new SearchIndexReader(new SearchIndex[]{ +// new SearchIndex("body", Path.of("/tmp"), +// new File("/home/vlofgren/data/urls.dat"), +// new File("/home/vlofgren/data/words.dat")), +// new SearchIndex("body", Path.of("/tmp"), +// new File("/home/vlofgren/data/urls.dat"), +// new File("/home/vlofgren/data/words.dat")) +// , +// new SearchIndex("body", Path.of("/tmp"), +// new File("/home/vlofgren/data/urls.dat"), +// new File("/home/vlofgren/data/words.dat")) +// , +// new SearchIndex("body", Path.of("/tmp"), +// new File("/home/vlofgren/data/urls.dat"), +// new File("/home/vlofgren/data/words.dat")) +// }); + +// getQuery(sir, new EdgeIndexSearchTerms(List.of(152, 106), Collections.emptyList())).stream().forEach(System.out::println); +// sir.findWord(152).also(106).stream().forEach(System.out::println); +// scanFile(inFile, (url, word) 
-> { +// //System.out.println(url + " " + word); +// if (!sir.findWord(word).stream().anyMatch(url::equals)) { +// logger.error("Can't find word {} in {}", word, url); +// } +// }); + + + } +/* + private SearchIndexReader.Query getQuery(SearchIndexReader indexReader, EdgeIndexSearchTerms searchTerms) { + var orderedIncludes = searchTerms.includes + .stream() + .sorted(Comparator.comparingLong(indexReader::numHits)) + .distinct() + .mapToInt(Integer::intValue) + .toArray(); + + logger.info("Includes: ({}); excludes: ({})", Arrays. + stream(orderedIncludes) + .mapToObj(String::valueOf) + .collect(Collectors.joining(",")), + searchTerms.excludes.stream().map(String::valueOf).collect(Collectors.joining(","))); + SearchIndexReader.Query query = indexReader.findWord(orderedIncludes[0]); + for (int i = 1; i < orderedIncludes.length; i++) { + query = query.also(orderedIncludes[i]); + } + for (int term : searchTerms.excludes) { + query = query.not(term); + } + return query; + } + +*/ +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java index edcfa71f..5f1d2a0f 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java @@ -1,14 +1,14 @@ package nu.marginalia.wmsa.edge.index.service; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.reader.SearchIndex; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; -import 
nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; -import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; +import nu.marginalia.wmsa.edge.index.service.index.SearchIndex; +import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader; +import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; +import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.model.EdgeId; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java index e780ed62..ee84472e 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java @@ -1,6 +1,6 @@ package nu.marginalia.wmsa.edge.index.service; -import nu.marginalia.wmsa.edge.index.dictionary.TokenCompressor; +import nu.marginalia.wmsa.edge.index.service.dictionary.TokenCompressor; import org.junit.jupiter.api.Test; import java.util.Arrays; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java index 65b1ad57..4aa9bceb 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java @@ -13,7 +13,6 @@ class QueryVariantsTest { QueryVariants variants; QueryParser parser; SentenceExtractor se; - @BeforeEach public void 
setUp() { LanguageModels lm = TestLanguageModels.getLanguageModels(); @@ -25,7 +24,7 @@ class QueryVariantsTest { parser = new QueryParser(new EnglishDictionary(dict), variants); } - @Test @SuppressWarnings("unchecked") + @Test void getQueryVariants() { System.out.println(se.extractSentence("we are alone")); testCase("DOS", List.of("DOS")); @@ -51,5 +50,7 @@ class QueryVariantsTest { private void testCase(String input, List... expected) { var tokens = variants.getQueryVariants(parser.extractBasicTokens(input)); System.out.println(tokens); +// var result = tokens.stream().map(lst -> lst.terms).collect(Collectors.toSet()); +// assertEquals(Set.of(expected), result, "Case failed: " + input); } } \ No newline at end of file From 5ef953ae3dd797179d582db0c32d68f9bdef8fe3 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 16 Jun 2022 14:01:49 +0200 Subject: [PATCH 26/27] Fixing typo on front page. --- marginalia_nu/src/main/resources/static/edge/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marginalia_nu/src/main/resources/static/edge/index.html b/marginalia_nu/src/main/resources/static/edge/index.html index 166e67b8..47d6e314 100644 --- a/marginalia_nu/src/main/resources/static/edge/index.html +++ b/marginalia_nu/src/main/resources/static/edge/index.html @@ -88,7 +88,7 @@ theology, the occult, knitting, - compter science, + computer science, or art.

From 93c274f1d4e17a26969b05df50807235a2b27c5e Mon Sep 17 00:00:00 2001 From: vlofgren Date: Fri, 8 Jul 2022 12:34:05 +0200 Subject: [PATCH 27/27] E2E-test for memex --- .../nu/marginalia/wmsa/edge/E2ETestBase.java | 16 ++ .../nu/marginalia/wmsa/edge/MemexE2ETest.java | 95 ++++++++++ marginalia_nu/src/e2e/resources/init.sh | 1 + marginalia_nu/src/e2e/resources/memex.sh | 39 +++++ .../src/e2e/resources/memex/index.gmi | 6 + .../src/e2e/resources/memex/log/a.gmi | 7 + .../src/e2e/resources/memex/log/b.gmi | 6 + .../src/e2e/resources/memex/log/index.gmi | 7 + .../src/e2e/resources/nginx/memex.conf | 27 +++ .../nu/marginalia/gemini/GeminiService.java | 163 +---------------- .../marginalia/gemini/GeminiServiceDummy.java | 10 ++ .../marginalia/gemini/GeminiServiceImpl.java | 164 ++++++++++++++++++ .../gemini/plugins/BareStaticPagePlugin.java | 7 +- .../nu/marginalia/wmsa/auth/AuthService.java | 23 ++- .../wmsa/configuration/ServiceDescriptor.java | 5 +- .../configuration/command/StartCommand.java | 1 - .../wmsa/configuration/server/Service.java | 48 +++-- .../java/nu/marginalia/wmsa/memex/Memex.java | 4 +- .../wmsa/memex/MemexConfigurationModule.java | 44 ++++- .../nu/marginalia/wmsa/memex/MemexMain.java | 2 +- .../marginalia/wmsa/memex/MemexService.java | 17 +- .../wmsa/memex/client/MemexApiClient.java | 2 +- .../memex/system/MemexSourceFileSystem.java | 6 +- .../wmsa/memex/system/git/MemexGitRepo.java | 15 ++ .../memex/system/git/MemexGitRepoDummy.java | 36 ++++ .../MemexGitRepoImpl.java} | 16 +- .../wmsa/memex/change/GemtextChangeTest.java | 14 +- .../memex/change/GemtextTaskUpdateTest.java | 14 +- .../GemtextTombstoneUpdateCaclulatorTest.java | 14 +- 29 files changed, 579 insertions(+), 230 deletions(-) create mode 100644 marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/MemexE2ETest.java create mode 100644 marginalia_nu/src/e2e/resources/memex.sh create mode 100644 marginalia_nu/src/e2e/resources/memex/index.gmi create mode 100644 
marginalia_nu/src/e2e/resources/memex/log/a.gmi create mode 100644 marginalia_nu/src/e2e/resources/memex/log/b.gmi create mode 100644 marginalia_nu/src/e2e/resources/memex/log/index.gmi create mode 100644 marginalia_nu/src/e2e/resources/nginx/memex.conf create mode 100644 marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiServiceDummy.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiServiceImpl.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepo.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepoDummy.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/{MemexGitRepo.java => git/MemexGitRepoImpl.java} (90%) diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java index 0c329a79..da40a7fc 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java @@ -43,6 +43,22 @@ public abstract class E2ETestBase { .withReadTimeout(Duration.ofSeconds(15))) ; } + public static GenericContainer forService(ServiceDescriptor service, GenericContainer mariaDB, String setupScript) { + return new GenericContainer<>("openjdk:17-alpine") + .dependsOn(mariaDB) + .withCopyFileToContainer(jarFile(), "/WMSA.jar") + .withCopyFileToContainer(MountableFile.forClasspathResource(setupScript), "/" + setupScript) + .withExposedPorts(service.port) + .withFileSystemBind(modelsPath(), "/wmsa/model", BindMode.READ_ONLY) + .withNetwork(network) + .withNetworkAliases(service.name) + .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name))) + .withCommand("sh", setupScript, service.name) + .waitingFor(Wait.forHttp("/internal/ping") + .forPort(service.port) + .withReadTimeout(Duration.ofSeconds(15))) + ; + } public static MountableFile jarFile() { Path 
cwd = Path.of(System.getProperty("user.dir")); diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/MemexE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/MemexE2ETest.java new file mode 100644 index 00000000..7410b3b3 --- /dev/null +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/MemexE2ETest.java @@ -0,0 +1,95 @@ +package nu.marginalia.wmsa.edge; + + +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import okhttp3.OkHttpClient; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.mariadb.jdbc.Driver; +import org.openqa.selenium.OutputType; +import org.openqa.selenium.chrome.ChromeOptions; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.*; +import org.testcontainers.containers.output.Slf4jLogConsumer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; +import org.testcontainers.utility.MountableFile; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.LocalDateTime; +import java.util.concurrent.TimeUnit; + +import static nu.marginalia.wmsa.configuration.ServiceDescriptor.AUTH; +import static nu.marginalia.wmsa.configuration.ServiceDescriptor.MEMEX; + +@Tag("e2e") +@Testcontainers +public class MemexE2ETest extends E2ETestBase { + @Container + public MariaDBContainer mariaDB = getMariaDBContainer(); + + @Container + public GenericContainer auth = forService(AUTH, mariaDB); + + @Container + public GenericContainer memexContainer = forService(MEMEX, mariaDB, "memex.sh") + .withClasspathResourceMapping("/memex", "/memex", BindMode.READ_ONLY); + + @Container + public NginxContainer proxyNginx = new NginxContainer<>("nginx:stable") + .dependsOn(auth) + .dependsOn(memexContainer) + .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("nginx"))) + .withCopyFileToContainer(MountableFile.forClasspathResource("nginx/memex.conf"), 
"/etc/nginx/conf.d/default.conf") + .withNetwork(network) + .withNetworkAliases("proxyNginx"); + + @Container + public BrowserWebDriverContainer chrome = new BrowserWebDriverContainer<>() + .withNetwork(network) + .withCapabilities(new ChromeOptions()); + + private Gson gson = new GsonBuilder().create(); + private OkHttpClient httpClient = new OkHttpClient.Builder() + .connectTimeout(100, TimeUnit.MILLISECONDS) + .readTimeout(6000, TimeUnit.SECONDS) + .retryOnConnectionFailure(true) + .followRedirects(true) + .build(); + + @Test + public void run() throws IOException, InterruptedException { + Thread.sleep(10_000); + new Driver(); + + var driver = chrome.getWebDriver(); + + driver.get("http://proxyNginx/"); + Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("frontpage")); + + driver.get("http://proxyNginx/log/"); + Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("log")); + + driver.get("http://proxyNginx/log/a.gmi"); + Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("log-a.gmi")); + + driver.get("http://proxyNginx/log/b.gmi"); + Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("log-b.gmi")); + } + + private static Path screenshotFilename(String operation) throws IOException { + var path = Path.of(System.getProperty("user.dir")).resolve("build/test/e2e/"); + Files.createDirectories(path); + + String name = String.format("test-%s-%s.png", operation, LocalDateTime.now()); + path = path.resolve(name); + + System.out.println("Screenshot in " + path); + return path; + } + + +} diff --git a/marginalia_nu/src/e2e/resources/init.sh b/marginalia_nu/src/e2e/resources/init.sh index 5409f787..2f9fa103 100644 --- a/marginalia_nu/src/e2e/resources/init.sh +++ b/marginalia_nu/src/e2e/resources/init.sh @@ -69,4 +69,5 @@ memex memex dating dating EOF +echo "*** Starting $1" WMSA_HOME=${HOME} java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1 
\ No newline at end of file diff --git a/marginalia_nu/src/e2e/resources/memex.sh b/marginalia_nu/src/e2e/resources/memex.sh new file mode 100644 index 00000000..6ce801b5 --- /dev/null +++ b/marginalia_nu/src/e2e/resources/memex.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +HOME=/wmsa + +mkdir -p ${HOME}/conf + +cat > ${HOME}/conf/db.properties < ${HOME}/conf/hosts < serve(connection)); - } - } - } - catch (IOException ex) { - logger.error("IO Exception in gemini server", ex); - } - } - - private void serve(SSLSocket socket) { - final GeminiConnection connection; - try { - connection = new GeminiConnection(socket); - } - catch (IOException ex) { - logger.error("Failed to create connection object", ex); - return; - } - - try { - handleRequest(connection); - } - catch (GeminiUserException ex) { - errorResponse(connection, ex.getMessage()); - } - catch (SSLException ex) { - logger.error(connection.getAddress() + " SSL error"); - connection.close(); - } - catch (Exception ex) { - errorResponse(connection, "Error"); - logger.error(connection.getAddress(), ex); - } - finally { - connection.close(); - } - } - - private void errorResponse(GeminiConnection connection, String message) { - if (connection.isConnected()) { - try { - logger.error("=> " + connection.getAddress(), message); - connection.writeStatusLine(GeminiStatusCode.ERROR_PERMANENT, message); - } - catch (IOException ex) { - logger.error("Exception while sending error", ex); - } - } - } - - private void handleRequest(GeminiConnection connection) throws Exception { - - final String address = connection.getAddress(); - logger.info("Connect: " + address); - - final Optional maybeUri = connection.readUrl(); - if (maybeUri.isEmpty()) { - logger.info("Done: {}", address); - return; - } - - final URI uri = maybeUri.get(); - logger.info("Request {}", uri); - - if (!uri.getScheme().equals("gemini")) { - throw new GeminiUserException("Unsupported protocol"); - } - - servePage(connection, uri); - logger.info("Done: {}", address); - 
} - - private void servePage(GeminiConnection connection, URI url) throws IOException { - String path = url.getPath(); - - for (Plugin p : plugins) { - if (p.serve(url, connection)) { - return; - } - } - - logger.error("FileNotFound {}", path); - connection.writeStatusLine(GeminiStatusCode.ERROR_TEMPORARY, "No such file"); - } - +public interface GeminiService { + String DEFAULT_FILENAME = "index.gmi"; + void run(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiServiceDummy.java b/marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiServiceDummy.java new file mode 100644 index 00000000..81586f31 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiServiceDummy.java @@ -0,0 +1,10 @@ +package nu.marginalia.gemini; + +import com.google.inject.Singleton; + +@Singleton +public class GeminiServiceDummy implements GeminiService { + @Override + public void run() { + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiServiceImpl.java b/marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiServiceImpl.java new file mode 100644 index 00000000..0381be48 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiServiceImpl.java @@ -0,0 +1,164 @@ +package nu.marginalia.gemini; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.google.inject.name.Named; +import nu.marginalia.gemini.io.GeminiConnection; +import nu.marginalia.gemini.io.GeminiSSLSetUp; +import nu.marginalia.gemini.io.GeminiStatusCode; +import nu.marginalia.gemini.io.GeminiUserException; +import nu.marginalia.gemini.plugins.BareStaticPagePlugin; +import nu.marginalia.gemini.plugins.Plugin; +import nu.marginalia.gemini.plugins.SearchPlugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.net.ssl.SSLException; +import javax.net.ssl.SSLServerSocket; +import javax.net.ssl.SSLServerSocketFactory; +import javax.net.ssl.SSLSocket; +import java.io.IOException; +import 
java.net.URI; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Optional; +import java.util.concurrent.Executor; +import java.util.concurrent.Executors; + +@Singleton +public class GeminiServiceImpl implements GeminiService { + + public final Path serverRoot; + + private final Logger logger = LoggerFactory.getLogger(getClass().getSimpleName()); + private final Executor pool = Executors.newFixedThreadPool(32); + private final SSLServerSocket serverSocket; + + private final Plugin[] plugins; + private final BadBotList badBotList = BadBotList.INSTANCE; + + @Inject + public GeminiServiceImpl(@Named("gemini-server-root") Path serverRoot, + @Named("gemini-server-port") Integer port, + GeminiSSLSetUp sslSetUp, + BareStaticPagePlugin pagePlugin, + SearchPlugin searchPlugin) throws Exception { + this.serverRoot = serverRoot; + logger.info("Setting up crypto"); + final SSLServerSocketFactory socketFactory = sslSetUp.getServerSocketFactory(); + + serverSocket = (SSLServerSocket) socketFactory.createServerSocket(port /* 1965 */); + serverSocket.setEnabledCipherSuites(socketFactory.getSupportedCipherSuites()); + serverSocket.setEnabledProtocols(new String[] {"TLSv1.3", "TLSv1.2"}); + + logger.info("Verifying setup"); + if (!Files.exists(this.serverRoot)) { + logger.error("Could not find SERVER_ROOT {}", this.serverRoot); + System.exit(255); + } + + plugins = new Plugin[] { + pagePlugin, + searchPlugin + }; + } + + @Override + public void run() { + logger.info("Awaiting connections"); + + try { + for (;;) { + SSLSocket connection = (SSLSocket) serverSocket.accept(); + connection.setSoTimeout(10_000); + + if (!badBotList.isAllowed(connection.getInetAddress())) { + connection.close(); + } else { + pool.execute(() -> serve(connection)); + } + } + } + catch (IOException ex) { + logger.error("IO Exception in gemini server", ex); + } + } + + private void serve(SSLSocket socket) { + final GeminiConnection connection; + try { + connection = new 
GeminiConnection(socket); + } + catch (IOException ex) { + logger.error("Failed to create connection object", ex); + return; + } + + try { + handleRequest(connection); + } + catch (GeminiUserException ex) { + errorResponse(connection, ex.getMessage()); + } + catch (SSLException ex) { + logger.error(connection.getAddress() + " SSL error"); + connection.close(); + } + catch (Exception ex) { + errorResponse(connection, "Error"); + logger.error(connection.getAddress(), ex); + } + finally { + connection.close(); + } + } + + private void errorResponse(GeminiConnection connection, String message) { + if (connection.isConnected()) { + try { + logger.error("=> " + connection.getAddress(), message); + connection.writeStatusLine(GeminiStatusCode.ERROR_PERMANENT, message); + } + catch (IOException ex) { + logger.error("Exception while sending error", ex); + } + } + } + + private void handleRequest(GeminiConnection connection) throws Exception { + + final String address = connection.getAddress(); + logger.info("Connect: " + address); + + final Optional maybeUri = connection.readUrl(); + if (maybeUri.isEmpty()) { + logger.info("Done: {}", address); + return; + } + + final URI uri = maybeUri.get(); + logger.info("Request {}", uri); + + if (!uri.getScheme().equals("gemini")) { + throw new GeminiUserException("Unsupported protocol"); + } + + servePage(connection, uri); + logger.info("Done: {}", address); + } + + private void servePage(GeminiConnection connection, URI url) throws IOException { + String path = url.getPath(); + + for (Plugin p : plugins) { + if (p.serve(url, connection)) { + return; + } + } + + logger.error("FileNotFound {}", path); + connection.writeStatusLine(GeminiStatusCode.ERROR_TEMPORARY, "No such file"); + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/BareStaticPagePlugin.java b/marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/BareStaticPagePlugin.java index fbfb502b..46bdfb7d 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/BareStaticPagePlugin.java +++ b/marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/BareStaticPagePlugin.java @@ -2,6 +2,7 @@ package nu.marginalia.gemini.plugins; import com.google.inject.Inject; import com.google.inject.name.Named; +import nu.marginalia.gemini.GeminiService; import nu.marginalia.gemini.io.GeminiConnection; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -11,8 +12,6 @@ import java.net.URI; import java.nio.file.Files; import java.nio.file.Path; -import static nu.marginalia.gemini.GeminiService.DEFAULT_FILENAME; - public class BareStaticPagePlugin implements Plugin { private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -43,8 +42,8 @@ public class BareStaticPagePlugin implements Plugin { private Path getServerPath(String requestPath) { final Path serverPath = Path.of(geminiServerRoot + requestPath); - if (Files.isDirectory(serverPath) && Files.isRegularFile(serverPath.resolve(DEFAULT_FILENAME))) { - return serverPath.resolve(DEFAULT_FILENAME); + if (Files.isDirectory(serverPath) && Files.isRegularFile(serverPath.resolve(GeminiService.DEFAULT_FILENAME))) { + return serverPath.resolve(GeminiService.DEFAULT_FILENAME); } return serverPath; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/AuthService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/AuthService.java index 60c22b9b..4c93db95 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/AuthService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/AuthService.java @@ -1,6 +1,5 @@ package nu.marginalia.wmsa.auth; -import com.github.jknack.handlebars.internal.Files; import com.google.inject.Inject; import com.google.inject.name.Named; import nu.marginalia.wmsa.auth.model.LoginFormModel; @@ -14,11 +13,12 @@ import spark.Request; import spark.Response; import spark.Spark; -import java.io.FileReader; import java.io.IOException; +import java.nio.file.Files; import 
java.nio.file.Path; import java.util.Objects; import java.util.Optional; +import java.util.UUID; import static spark.Spark.*; @@ -40,11 +40,8 @@ public class AuthService extends Service { super(ip, port, initialization, metricsServer); - try (var is = new FileReader(topSecretPasswordFile.toFile())) { - password = Files.read(is); - } catch (IOException e) { - logger.error("Could not read password from file " + topSecretPasswordFile, e); - } + password = initPassword(topSecretPasswordFile); + loginFormRenderer = rendererFactory.renderer("auth/login"); Spark.path("public/api", () -> { @@ -60,6 +57,18 @@ public class AuthService extends Service { }); } + private String initPassword(Path topSecretPasswordFile) { + if (Files.exists(topSecretPasswordFile)) { + try { + return Files.readString(topSecretPasswordFile); + } catch (IOException e) { + logger.error("Could not read password from file " + topSecretPasswordFile, e); + } + } + logger.error("Setting random password"); + return UUID.randomUUID().toString(); + } + private Object loginForm(Request request, Response response) { String redir = Objects.requireNonNull(request.queryParams("redirect")); String service = Objects.requireNonNull(request.queryParams("service")); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java index e0aff247..c0f7dde2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java @@ -1,7 +1,7 @@ package nu.marginalia.wmsa.configuration; -import nu.marginalia.wmsa.auth.AuthMain; import nu.marginalia.wmsa.api.ApiMain; +import nu.marginalia.wmsa.auth.AuthMain; import nu.marginalia.wmsa.configuration.command.Command; import nu.marginalia.wmsa.configuration.command.ListCommand; import nu.marginalia.wmsa.configuration.command.StartCommand; @@ -35,7 
+35,7 @@ public enum ServiceDescriptor { EDGE_SEARCH("edge-search", 5023, EdgeSearchMain.class), EDGE_ASSISTANT("edge-assistant", 5025, EdgeAssistantMain.class), - EDGE_MEMEX("memex", 5030, MemexMain.class), + MEMEX("memex", 5030, MemexMain.class), ENCYCLOPEDIA("encyclopedia", 5040, EncyclopediaMain.class), @@ -79,7 +79,6 @@ public enum ServiceDescriptor { } public static void main(String... args) { - MainMapLookup.setMainArguments(args); Map functions = Stream.of(new ListCommand(), new StartCommand(), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/StartCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/StartCommand.java index 55d46813..cb63d749 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/StartCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/StartCommand.java @@ -16,7 +16,6 @@ public class StartCommand extends Command { System.err.println("Usage: start service-descriptor"); System.exit(255); } - var mainMethod = getKind(args[1]).mainClass.getMethod("main", String[].class); String[] args2 = Arrays.copyOfRange(args, 2, args.length); mainMethod.invoke(null, (Object) args2); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/Service.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/Service.java index c9f618da..9674611f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/Service.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/Service.java @@ -37,7 +37,7 @@ public class Service { private static volatile boolean initialized = false; - public Service(String ip, int port, Initialization initialization, MetricsServer metricsServer) { + public Service(String ip, int port, Initialization initialization, MetricsServer metricsServer, Runnable configureStaticFiles) { this.initialization = initialization; serviceName = 
System.getProperty("service-name"); @@ -51,8 +51,7 @@ public class Service { logger.info("{} Listening to {}:{}", getClass().getSimpleName(), ip == null ? "" : ip, port); - Spark.staticFiles.expireTime(3600); - Spark.staticFiles.header("Cache-control", "public"); + configureStaticFiles.run(); Spark.before(this::filterPublicRequests); Spark.before(this::auditRequestIn); @@ -66,24 +65,35 @@ public class Service { } } + public Service(String ip, int port, Initialization initialization, MetricsServer metricsServer) { + this(ip, port, initialization, metricsServer, () -> { + // configureStaticFiles can't be an overridable method in Service because it may + // need to depend on parameters to the constructor, and super-constructors + // must run first + Spark.staticFiles.expireTime(3600); + Spark.staticFiles.header("Cache-control", "public"); + }); + } + private void filterPublicRequests(Request request, Response response) { - if (null != request.headers("X-Public")) { - - String context = Optional - .ofNullable(request.headers("X-Context")) - .orElseGet(request::ip); - - if (!request.pathInfo().startsWith("/public/")) { - logger.warn(httpMarker, "External connection to internal API: {} -> {} {}", context, request.requestMethod(), request.pathInfo()); - Spark.halt(HttpStatus.SC_FORBIDDEN); - } - - String url = request.pathInfo(); - if (request.queryString() != null) { - url = url + "?" 
+ request.queryString(); - } - logger.info(httpMarker, "PUBLIC {}: {} {}", Context.fromRequest(request).getIpHash().orElse("?"), request.requestMethod(), url); + if (null == request.headers("X-Public")) { + return; } + + String context = Optional + .ofNullable(request.headers("X-Context")) + .orElseGet(request::ip); + + if (!request.pathInfo().startsWith("/public/")) { + logger.warn(httpMarker, "External connection to internal API: {} -> {} {}", context, request.requestMethod(), request.pathInfo()); + Spark.halt(HttpStatus.SC_FORBIDDEN); + } + + String url = request.pathInfo(); + if (request.queryString() != null) { + url = url + "?" + request.queryString(); + } + logger.info(httpMarker, "PUBLIC {}: {} {}", Context.fromRequest(request).getIpHash().orElse("?"), request.requestMethod(), url); } private Object isInitialized(Request request, Response response) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/Memex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/Memex.java index 5b5ac2f7..febdc5af 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/Memex.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/Memex.java @@ -6,9 +6,9 @@ import com.google.inject.name.Named; import io.reactivex.rxjava3.schedulers.Schedulers; import nu.marginalia.gemini.GeminiService; import nu.marginalia.gemini.gmi.GemtextDatabase; +import nu.marginalia.gemini.gmi.GemtextDocument; import nu.marginalia.util.graphics.dithering.FloydSteinbergDither; import nu.marginalia.util.graphics.dithering.Palettes; -import nu.marginalia.gemini.gmi.GemtextDocument; import nu.marginalia.wmsa.memex.change.GemtextTombstoneUpdateCaclulator; import nu.marginalia.wmsa.memex.model.MemexImage; import nu.marginalia.wmsa.memex.model.MemexNode; @@ -16,7 +16,7 @@ import nu.marginalia.wmsa.memex.model.MemexNodeUrl; import nu.marginalia.wmsa.memex.renderer.MemexRendererers; import nu.marginalia.wmsa.memex.system.MemexFileSystemMonitor; import 
nu.marginalia.wmsa.memex.system.MemexFileWriter; -import nu.marginalia.wmsa.memex.system.MemexGitRepo; +import nu.marginalia.wmsa.memex.system.git.MemexGitRepo; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexConfigurationModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexConfigurationModule.java index 676ebc05..2533a9d1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexConfigurationModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexConfigurationModule.java @@ -5,23 +5,59 @@ import com.google.inject.Inject; import com.google.inject.Provider; import com.google.inject.name.Named; import com.google.inject.name.Names; +import lombok.SneakyThrows; +import nu.marginalia.gemini.GeminiService; +import nu.marginalia.gemini.GeminiServiceDummy; +import nu.marginalia.gemini.GeminiServiceImpl; import nu.marginalia.wmsa.memex.system.MemexFileWriter; +import nu.marginalia.wmsa.memex.system.git.MemexGitRepo; +import nu.marginalia.wmsa.memex.system.git.MemexGitRepoDummy; +import nu.marginalia.wmsa.memex.system.git.MemexGitRepoImpl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.nio.file.Path; public class MemexConfigurationModule extends AbstractModule { + private static final Logger logger = LoggerFactory.getLogger(MemexConfigurationModule.class); + + private static final String MEMEX_ROOT_PROPERTY = System.getProperty("memex-root", "/var/lib/wmsa/memex"); + private static final String MEMEX_HTML_PROPERTY = System.getProperty("memex-html-resources", "/var/lib/wmsa/memex-html"); + private static final String MEMEX_GMI_PROPERTY = System.getProperty("memex-gmi-resources", "/var/lib/wmsa/memex-gmi"); + + private static final boolean MEMEX_DISABLE_GIT = Boolean.getBoolean("memex-disable-git"); + private static final boolean MEMEX_DISABLE_GEMINI = Boolean.getBoolean("memex-disable-gemini"); + + @SneakyThrows + public 
MemexConfigurationModule() { + Thread.sleep(100); + } + public void configure() { - bind(Path.class).annotatedWith(Names.named("memex-root")).toInstance(Path.of("/var/lib/wmsa/memex")); - bind(Path.class).annotatedWith(Names.named("memex-html-resources")).toInstance(Path.of("/var/lib/wmsa/memex-html")); - bind(Path.class).annotatedWith(Names.named("memex-gmi-resources")).toInstance(Path.of("/var/lib/wmsa/memex-gmi")); + bind(Path.class).annotatedWith(Names.named("memex-root")).toInstance(Path.of(MEMEX_ROOT_PROPERTY)); + bind(Path.class).annotatedWith(Names.named("memex-html-resources")).toInstance(Path.of(MEMEX_HTML_PROPERTY)); + bind(Path.class).annotatedWith(Names.named("memex-gmi-resources")).toInstance(Path.of(MEMEX_GMI_PROPERTY)); + bind(String.class).annotatedWith(Names.named("tombestone-special-file")).toInstance("/special/tombstone.gmi"); bind(String.class).annotatedWith(Names.named("redirects-special-file")).toInstance("/special/redirect.gmi"); + switchImpl(MemexGitRepo.class, MEMEX_DISABLE_GIT, MemexGitRepoDummy.class, MemexGitRepoImpl.class); + switchImpl(GeminiService.class, MEMEX_DISABLE_GEMINI, GeminiServiceDummy.class, GeminiServiceImpl.class); + bind(MemexFileWriter.class).annotatedWith(Names.named("html")).toProvider(MemexHtmlWriterProvider.class); bind(MemexFileWriter.class).annotatedWith(Names.named("gmi")).toProvider(MemexGmiWriterProvider.class); } - + void switchImpl(Class impl, boolean param, Class ifEnabled, Class ifDisabled) { + final Class choice; + if (param) { + choice = ifEnabled; + } + else { + choice = ifDisabled; + } + bind(impl).to(choice).asEagerSingleton(); + } public static class MemexHtmlWriterProvider implements Provider { private final Path path; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexMain.java index e58848d2..f46ce4d1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexMain.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexMain.java @@ -18,7 +18,7 @@ public class MemexMain extends MainClass { } public static void main(String... args) { - init(ServiceDescriptor.EDGE_MEMEX, args); + init(ServiceDescriptor.MEMEX, args); Injector injector = Guice.createInjector( new MemexConfigurationModule(), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexService.java index 4d22f1af..16440960 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexService.java @@ -3,6 +3,7 @@ package nu.marginalia.wmsa.memex; import com.google.inject.Inject; import com.google.inject.name.Named; import lombok.SneakyThrows; +import nu.marginalia.gemini.gmi.GemtextDocument; import nu.marginalia.gemini.gmi.renderer.GemtextRendererFactory; import nu.marginalia.wmsa.auth.client.AuthClient; import nu.marginalia.wmsa.configuration.server.Context; @@ -10,12 +11,11 @@ import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.Service; import nu.marginalia.wmsa.memex.change.GemtextMutation; -import nu.marginalia.gemini.gmi.GemtextDocument; import nu.marginalia.wmsa.memex.change.update.GemtextDocumentUpdateCalculator; -import nu.marginalia.wmsa.memex.renderer.MemexHtmlRenderer; import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; import nu.marginalia.wmsa.memex.model.MemexNodeUrl; import nu.marginalia.wmsa.memex.model.render.*; +import nu.marginalia.wmsa.memex.renderer.MemexHtmlRenderer; import org.apache.http.HttpStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -49,9 +49,18 @@ public class MemexService extends Service { MemexHtmlRenderer renderer, AuthClient authClient, Initialization initialization, - MetricsServer metricsServer) { + MetricsServer 
metricsServer, + @Named("memex-html-resources") Path memexHtmlDir + ) { - super(ip, port, initialization, metricsServer); + super(ip, port, initialization, metricsServer, () -> { + staticFiles.externalLocation(memexHtmlDir.toString()); + staticFiles.disableMimeTypeGuessing(); + staticFiles.registerMimeType("gmi", "text/html"); + staticFiles.registerMimeType("png", "text/html"); + staticFiles.expireTime(60); + staticFiles.header("Cache-control", "public,proxy-revalidate"); + }); this.updateCalculator = updateCalculator; this.memex = memex; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/client/MemexApiClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/client/MemexApiClient.java index b98b34c7..b038637d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/client/MemexApiClient.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/client/MemexApiClient.java @@ -8,7 +8,7 @@ import nu.marginalia.wmsa.configuration.ServiceDescriptor; public class MemexApiClient extends AbstractDynamicClient { @Inject public MemexApiClient() { - super(ServiceDescriptor.EDGE_MEMEX); + super(ServiceDescriptor.MEMEX); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexSourceFileSystem.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexSourceFileSystem.java index c72e2383..9d165272 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexSourceFileSystem.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexSourceFileSystem.java @@ -4,11 +4,15 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.inject.name.Named; import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.wmsa.memex.system.git.MemexGitRepo; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.nio.file.*; +import java.nio.file.Files; +import java.nio.file.Path; +import 
java.nio.file.StandardCopyOption; +import java.nio.file.StandardOpenOption; @Singleton public class MemexSourceFileSystem { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepo.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepo.java new file mode 100644 index 00000000..d4e55491 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepo.java @@ -0,0 +1,15 @@ +package nu.marginalia.wmsa.memex.system.git; + +import nu.marginalia.wmsa.memex.model.MemexNodeUrl; + +public interface MemexGitRepo { + void pull(); + + void remove(MemexNodeUrl url); + + void add(MemexNodeUrl url); + + void update(MemexNodeUrl url); + + void rename(MemexNodeUrl src, MemexNodeUrl dst); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepoDummy.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepoDummy.java new file mode 100644 index 00000000..4d5116ff --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepoDummy.java @@ -0,0 +1,36 @@ +package nu.marginalia.wmsa.memex.system.git; + +import com.google.inject.Singleton; +import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Singleton +public class MemexGitRepoDummy implements MemexGitRepo { + private static final Logger logger = LoggerFactory.getLogger(MemexGitRepoDummy.class); + + @Override + public void pull() { + logger.info("Would perform a pull here"); + } + + @Override + public void remove(MemexNodeUrl url) { + logger.info("Would perform a remove here"); + } + + @Override + public void add(MemexNodeUrl url) { + logger.info("Would perform an add here"); + } + + @Override + public void update(MemexNodeUrl url) { + logger.info("Would perform an update here"); + } + + @Override + public void rename(MemexNodeUrl src, MemexNodeUrl dst) { + logger.info("Would perform a 
rename here"); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexGitRepo.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepoImpl.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexGitRepo.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepoImpl.java index 05ca6603..10c72060 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexGitRepo.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepoImpl.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.system; +package nu.marginalia.wmsa.memex.system.git; import com.google.inject.Inject; import com.google.inject.Singleton; @@ -10,7 +10,8 @@ import org.eclipse.jgit.api.Git; import org.eclipse.jgit.api.errors.GitAPIException; import org.eclipse.jgit.lib.Repository; import org.eclipse.jgit.storage.file.FileRepositoryBuilder; -import org.eclipse.jgit.transport.*; +import org.eclipse.jgit.transport.JschConfigSessionFactory; +import org.eclipse.jgit.transport.SshSessionFactory; import org.eclipse.jgit.util.FS; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -19,13 +20,13 @@ import java.io.IOException; import java.nio.file.Path; @Singleton -public class MemexGitRepo { +public class MemexGitRepoImpl implements MemexGitRepo { private final Git git; - private final Logger logger = LoggerFactory.getLogger(MemexGitRepo.class); + private final Logger logger = LoggerFactory.getLogger(MemexGitRepoImpl.class); @Inject - public MemexGitRepo(@Named("memex-root") Path root) throws IOException { + public MemexGitRepoImpl(@Named("memex-root") Path root) throws IOException { FileRepositoryBuilder repositoryBuilder = new FileRepositoryBuilder(); @@ -49,6 +50,7 @@ public class MemexGitRepo { pull(); } + @Override public void pull() { try { git.pull().call(); @@ -58,6 +60,7 @@ public class MemexGitRepo { } } + @Override 
public void remove(MemexNodeUrl url) { try { git.rm() @@ -72,6 +75,7 @@ public class MemexGitRepo { } } + @Override public void add(MemexNodeUrl url) { try { git.add() @@ -87,6 +91,7 @@ public class MemexGitRepo { logger.error("Git operation failed", ex); } } + @Override public void update(MemexNodeUrl url) { try { git.add() @@ -105,6 +110,7 @@ public class MemexGitRepo { } + @Override public void rename(MemexNodeUrl src, MemexNodeUrl dst) { try { git.rm().addFilepattern(filePattern(src)).call(); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java index 9699bcf9..e3e670c7 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java @@ -2,16 +2,18 @@ package nu.marginalia.wmsa.memex.change; import io.reactivex.rxjava3.plugins.RxJavaPlugins; import lombok.SneakyThrows; -import nu.marginalia.gemini.GeminiService; +import nu.marginalia.gemini.GeminiServiceImpl; import nu.marginalia.util.test.TestUtil; -import nu.marginalia.wmsa.memex.*; +import nu.marginalia.wmsa.memex.Memex; +import nu.marginalia.wmsa.memex.MemexData; +import nu.marginalia.wmsa.memex.MemexLoader; import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; import nu.marginalia.wmsa.memex.model.MemexNodeUrl; import nu.marginalia.wmsa.memex.renderer.MemexRendererers; import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes; import nu.marginalia.wmsa.memex.system.MemexFileWriter; -import nu.marginalia.wmsa.memex.system.MemexGitRepo; import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem; +import nu.marginalia.wmsa.memex.system.git.MemexGitRepoImpl; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; @@ -61,13 +63,13 @@ class GemtextChangeTest { var data = new MemexData(); 
memex = new Memex(data, null, - Mockito.mock(MemexGitRepo.class), new MemexLoader(data, new MemexFileSystemModifiedTimes(), - new MemexSourceFileSystem(tempDir, Mockito.mock(MemexGitRepo.class)), + Mockito.mock(MemexGitRepoImpl.class), new MemexLoader(data, new MemexFileSystemModifiedTimes(), + new MemexSourceFileSystem(tempDir, Mockito.mock(MemexGitRepoImpl.class)), tempDir, tombstonePath, redirectPath), Mockito.mock(MemexFileWriter.class), null, Mockito.mock(MemexRendererers.class), - Mockito.mock(GeminiService.class)); + Mockito.mock(GeminiServiceImpl.class)); } @SneakyThrows diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java index 8aefc613..d80d32eb 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java @@ -2,18 +2,20 @@ package nu.marginalia.wmsa.memex.change; import io.reactivex.rxjava3.plugins.RxJavaPlugins; import lombok.SneakyThrows; -import nu.marginalia.gemini.GeminiService; +import nu.marginalia.gemini.GeminiServiceImpl; import nu.marginalia.gemini.gmi.GemtextDocument; import nu.marginalia.util.test.TestUtil; -import nu.marginalia.wmsa.memex.*; +import nu.marginalia.wmsa.memex.Memex; +import nu.marginalia.wmsa.memex.MemexData; +import nu.marginalia.wmsa.memex.MemexLoader; import nu.marginalia.wmsa.memex.change.update.GemtextDocumentUpdateCalculator; import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; import nu.marginalia.wmsa.memex.model.MemexNodeUrl; import nu.marginalia.wmsa.memex.renderer.MemexRendererers; import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes; import nu.marginalia.wmsa.memex.system.MemexFileWriter; -import nu.marginalia.wmsa.memex.system.MemexGitRepo; import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem; +import 
nu.marginalia.wmsa.memex.system.git.MemexGitRepoImpl; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; @@ -67,12 +69,12 @@ class GemtextTaskUpdateTest { Files.createDirectory(tempDir.resolve("special")); var data = new MemexData(); - memex = new Memex(data, null, Mockito.mock(MemexGitRepo.class), new MemexLoader(data, new MemexFileSystemModifiedTimes(), - new MemexSourceFileSystem(tempDir, Mockito.mock(MemexGitRepo.class)), tempDir, tombstonePath, redirectPath), + memex = new Memex(data, null, Mockito.mock(MemexGitRepoImpl.class), new MemexLoader(data, new MemexFileSystemModifiedTimes(), + new MemexSourceFileSystem(tempDir, Mockito.mock(MemexGitRepoImpl.class)), tempDir, tombstonePath, redirectPath), Mockito.mock(MemexFileWriter.class), null, Mockito.mock(MemexRendererers.class), - Mockito.mock(GeminiService.class)); + Mockito.mock(GeminiServiceImpl.class)); } @SneakyThrows diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java index bfe3b104..51120654 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java @@ -2,15 +2,17 @@ package nu.marginalia.wmsa.memex.change; import io.reactivex.rxjava3.plugins.RxJavaPlugins; import lombok.SneakyThrows; -import nu.marginalia.gemini.GeminiService; +import nu.marginalia.gemini.GeminiServiceImpl; import nu.marginalia.util.test.TestUtil; -import nu.marginalia.wmsa.memex.*; +import nu.marginalia.wmsa.memex.Memex; +import nu.marginalia.wmsa.memex.MemexData; +import nu.marginalia.wmsa.memex.MemexLoader; import nu.marginalia.wmsa.memex.model.MemexNodeUrl; import nu.marginalia.wmsa.memex.renderer.MemexRendererers; import 
nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes; import nu.marginalia.wmsa.memex.system.MemexFileWriter; -import nu.marginalia.wmsa.memex.system.MemexGitRepo; import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem; +import nu.marginalia.wmsa.memex.system.git.MemexGitRepoImpl; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; @@ -64,13 +66,13 @@ class GemtextTombstoneUpdateCaclulatorTest { var data = new MemexData(); memex = new Memex(data, null, - Mockito.mock(MemexGitRepo.class), + Mockito.mock(MemexGitRepoImpl.class), new MemexLoader(data, new MemexFileSystemModifiedTimes(), - new MemexSourceFileSystem(tempDir, Mockito.mock(MemexGitRepo.class)), tempDir, tombstonePath, redirectPath), + new MemexSourceFileSystem(tempDir, Mockito.mock(MemexGitRepoImpl.class)), tempDir, tombstonePath, redirectPath), Mockito.mock(MemexFileWriter.class), updateCaclulator, Mockito.mock(MemexRendererers.class), - Mockito.mock(GeminiService.class)); + Mockito.mock(GeminiServiceImpl.class)); } @SneakyThrows