From 04f905f3a1f3e4a99e7ba6b2145420817c098971 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 3 Feb 2023 13:31:51 +0100 Subject: [PATCH] Reintroduce the ability to filter search results by their ranking. --- .../util/ranking/BuggyReversePageRank.java | 39 ------ .../util/ranking/BuggyStandardPageRank.java | 45 ------ .../util/ranking/tool/DedupTool.java | 89 ------------ .../ranking/tool/UpdateDomainRanksTool.java | 93 ------------- .../wmsa/edge/index/EdgeIndexControl.java | 7 +- .../wmsa/edge/index/IndexServicesFactory.java | 22 +-- .../model/EdgePageDocumentsMetadata.java | 25 +++- .../edge/index/postings/DomainRankings.java | 43 ++++++ .../index/postings/SearchIndexControl.java | 6 +- .../forward/ForwardIndexConverter.java | 19 ++- .../forward/ParamMatchingQueryFilter.java | 15 ++ .../postings/reverse/ReverseIndexReader.java | 5 + .../edge/index/query/IndexQueryParams.java | 1 + .../edge/index}/ranking/RankingAlgorithm.java | 129 ++++-------------- .../index}/ranking/RankingDomainData.java | 2 +- .../index}/ranking/RankingDomainFetcher.java | 2 +- .../edge/index/ranking/ReversePageRank.java} | 8 +- .../edge/index/ranking/StandardPageRank.java} | 9 +- .../accumulator/RankingResultAccumulator.java | 6 + .../RankingResultBitSetAccumulator.java | 17 +++ .../RankingResultHashMapAccumulator.java | 21 +++ .../RankingResultListAccumulator.java | 24 ++++ .../ranking/old/OldReversePageRankV2.java | 2 +- .../index}/ranking/old/StandardPageRank.java | 5 +- .../index}/ranking/tool/PerusePageRankV2.java | 10 +- .../ranking/tool/UpdateDomainRanksTool2.java | 20 +-- .../edge/index/svc/EdgeIndexQueryService.java | 1 + .../index/svc/EdgeIndexSearchSetsService.java | 112 ++++----------- .../search/EdgeSearchResultKeywordScore.java | 5 + .../model/search/EdgeSearchSpecification.java | 1 + .../wmsa/edge/search/query/QueryFactory.java | 7 + .../wmsa/edge/search/query/QueryParser.java | 3 + .../templates/edge/parts/search-footer.hdb | 3 + 
.../model/EdgePageDocumentsMetadataTest.java | 26 +++- .../forward/ForwardIndexConverterTest.java | 10 +- .../service/EdgeIndexIntegrationTest.java | 5 +- .../EdgeIndexIntegrationTestModule.java | 4 +- 37 files changed, 305 insertions(+), 536 deletions(-) delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/DomainRankings.java rename marginalia_nu/src/main/java/nu/marginalia/{util => wmsa/edge/index}/ranking/RankingAlgorithm.java (67%) rename marginalia_nu/src/main/java/nu/marginalia/{util => wmsa/edge/index}/ranking/RankingDomainData.java (93%) rename marginalia_nu/src/main/java/nu/marginalia/{util => wmsa/edge/index}/ranking/RankingDomainFetcher.java (99%) rename marginalia_nu/src/main/java/nu/marginalia/{util/ranking/BetterReversePageRank.java => wmsa/edge/index/ranking/ReversePageRank.java} (84%) rename marginalia_nu/src/main/java/nu/marginalia/{util/ranking/BetterStandardPageRank.java => wmsa/edge/index/ranking/StandardPageRank.java} (77%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultAccumulator.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultBitSetAccumulator.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultHashMapAccumulator.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultListAccumulator.java rename marginalia_nu/src/main/java/nu/marginalia/{util => 
wmsa/edge/index}/ranking/old/OldReversePageRankV2.java (99%) rename marginalia_nu/src/main/java/nu/marginalia/{util => wmsa/edge/index}/ranking/old/StandardPageRank.java (98%) rename marginalia_nu/src/main/java/nu/marginalia/{util => wmsa/edge/index}/ranking/tool/PerusePageRankV2.java (97%) rename marginalia_nu/src/main/java/nu/marginalia/{util => wmsa/edge/index}/ranking/tool/UpdateDomainRanksTool2.java (80%) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java deleted file mode 100644 index 485ba353..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java +++ /dev/null @@ -1,39 +0,0 @@ -package nu.marginalia.util.ranking; - - -public class BuggyReversePageRank extends RankingAlgorithm { - - - public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) { - super(domains, origins); - } - - @Override - RankVector createNewRankVector(RankVector rank) { - - double rankNorm = rank.norm(); - RankVector newRank = new RankVector(0); - - for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) { - - var links = linkDataSrc2Dest[domainId]; - - if (links != null && links.size() > 0) { - double newRankValue = 0; - - for (int j = 0; j < links.size(); j++) { - newRankValue += rank.get(links.getQuick(j)) / links.size(); - } - - newRank.set(domainId, 0.85*newRankValue/rankNorm); - } - } - return newRank; - } - - @Override - void adjustRankVector(RankVector vector, double dNorm, double oldNorm) { - originDomainIds.forEach(id -> vector.increment(domainIdToIndex.get(id), dNorm/oldNorm)); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java deleted file mode 100644 index 836bcdfe..00000000 --- 
a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java +++ /dev/null @@ -1,45 +0,0 @@ -package nu.marginalia.util.ranking; - - -public class BuggyStandardPageRank extends RankingAlgorithm { - - public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) { - super(domains, origins); - } - - @Override - RankingAlgorithm.RankVector createNewRankVector(RankingAlgorithm.RankVector rank) { - RankVector newRank = new RankVector(0); - - for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) { - - var links = linkDataSrc2Dest[domainId]; - double newRankValue = 0; - - if (links != null && links.size() > 0) { - for (int j = 0; j < links.size(); j++) { - int linkedDomain = links.getQuick(j); - - int linkSize = 1; - var bl = linkDataSrc2Dest[linkedDomain]; - if (bl != null) { - linkSize = bl.size(); - } - - newRankValue += rank.get(linkedDomain) / linkSize; - - } - } - - newRank.set(domainId, 0.85 * newRankValue); - } - return newRank; - } - - @Override - void adjustRankVector(RankingAlgorithm.RankVector vector, double dNorm, double oldNorm) { - originDomainIds.forEach(id -> vector.increment(id, dNorm/originDomainIds.size())); - vector.incrementAll(0.14*dNorm/vector.size()); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java deleted file mode 100644 index d6f95f51..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java +++ /dev/null @@ -1,89 +0,0 @@ -package nu.marginalia.util.ranking.tool; - -import lombok.AllArgsConstructor; -import lombok.Getter; -import lombok.SneakyThrows; -import lombok.ToString; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import org.mariadb.jdbc.Driver; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.*; -import java.util.concurrent.LinkedBlockingQueue; 
-import java.util.stream.Collectors; -import java.util.stream.Stream; - -public class DedupTool { - - private static final Logger logger = LoggerFactory.getLogger(DedupTool.class); - - public Set originDomains = new HashSet<>(); - public Set originDomainIds = new HashSet<>(); - public final long domainIdMax = -1; - public int domainCount; - private volatile static int rankMax; - - public int maxId() { - return (int) domainIdMax; - } - public int domainCount() { - return domainCount; - } - - static LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); - volatile static boolean running = true; - - @AllArgsConstructor @ToString @Getter - static class Data { - String url; - int id; - String domain; - } - - @SneakyThrows - public static void main(String... args) { - Driver driver = new Driver(); - var ds = new DatabaseModule().provideConnection(); - - Map>> domainToHashToUrl = new HashMap<>(); - - try (var conn = ds.getConnection(); - var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL"); - var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?") - - ) { - fetchStmt.setFetchSize(10_000); - var rsp = fetchStmt.executeQuery(); - while (rsp.next()) { - domainToHashToUrl.computeIfAbsent(rsp.getInt(1), i -> new HashMap<>()) - .computeIfAbsent(rsp.getInt(2), i -> new ArrayList<>()).add(new Data(rsp.getString(3), rsp.getInt(4), rsp.getString(5))); - } - - - List updateIds = new ArrayList<>(); - - domainToHashToUrl.forEach((domain, hashes) -> { - hashes.forEach((hash, urls) -> { - if (urls.size() > 1) { - Comparator c = Comparator.comparing(d -> d.domain.length()); - var urls2 = urls.stream().sorted(c.thenComparing(d -> d.url.length())) - .collect(Collectors.partitioningBy(d -> d.url.endsWith("/"))); - - Stream - .concat(urls2.get(true).stream(),urls2.get(false).stream()).skip(1) - 
.map(Data::getId) - .forEach(updateIds::add); - } - }); - }); - - for (int id : updateIds) { - updateStmt.setInt(1, id); - updateStmt.executeUpdate(); - } - } - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java deleted file mode 100644 index e251092f..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java +++ /dev/null @@ -1,93 +0,0 @@ -package nu.marginalia.util.ranking.tool; - -import com.zaxxer.hikari.HikariDataSource; -import lombok.SneakyThrows; -import nu.marginalia.util.ranking.BuggyStandardPageRank; -import nu.marginalia.util.ranking.RankingDomainFetcher; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; -import org.mariadb.jdbc.Driver; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.util.HashSet; -import java.util.Set; -import java.util.concurrent.LinkedBlockingQueue; - -public class UpdateDomainRanksTool { - - private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class); - - public Set originDomains = new HashSet<>(); - public Set originDomainIds = new HashSet<>(); - public final long domainIdMax = -1; - public int domainCount; - private volatile static int rankMax; - - public int maxId() { - return (int) domainIdMax; - } - public int domainCount() { - return domainCount; - } - - static final LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); - volatile static boolean running = true; - - @SneakyThrows - public static void main(String... 
args) { - org.mariadb.jdbc.Driver driver = new Driver(); - var conn = new DatabaseModule().provideConnection(); - - long start = System.currentTimeMillis(); - var uploader = new Thread(() -> uploadThread(conn), "Uploader"); - - logger.info("Ranking"); - var ds = new DatabaseModule().provideConnection(); - var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); - var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu"); - - rankMax = spr.size()*2; - uploader.start(); - - var rankData = spr.pageRankWithPeripheralNodes(rankMax); - for (int i : rankData) { - try { - uploadQueue.put(i); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - - long end = System.currentTimeMillis(); - running = false; - uploader.join(); - - logger.info("Done in {}", (end - start)/1000.0); - } - - public static void uploadThread(HikariDataSource dataSource) { - int i = 0; - - try (var conn = dataSource.getConnection()) { - logger.info("Resetting rank"); - try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=1")) { - stmt.executeUpdate(); - } - - logger.info("Updating ranks"); - try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=? 
WHERE ID=?")) { - while (running || (!running && !uploadQueue.isEmpty())) { - var job = uploadQueue.take(); - stmt.setDouble(1, i++ / (double) rankMax); - stmt.setInt(2, job); - stmt.executeUpdate(); - } - } - - } catch (SQLException | InterruptedException throwables) { - throwables.printStackTrace(); - } - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java index 980d0d32..87f65926 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java @@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.index; import com.google.inject.Inject; +import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService; import java.io.IOException; @@ -9,14 +10,16 @@ import java.io.IOException; public class EdgeIndexControl { private final IndexServicesFactory servicesFactory; + private final EdgeIndexSearchSetsService searchSetsService; @Inject - public EdgeIndexControl(IndexServicesFactory servicesFactory) { + public EdgeIndexControl(IndexServicesFactory servicesFactory, EdgeIndexSearchSetsService searchSetsService) { this.servicesFactory = servicesFactory; + this.searchSetsService = searchSetsService; } public void regenerateIndex() throws IOException { - servicesFactory.convertIndex(); + servicesFactory.convertIndex(searchSetsService.getDomainRankings()); System.gc(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java index e81a1682..10bb1db1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java @@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist; import 
nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.wmsa.edge.index.postings.DomainRankings; import nu.marginalia.wmsa.edge.index.postings.SearchIndex; import nu.marginalia.wmsa.edge.index.postings.SearchIndexReader; import nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexConverter; @@ -20,6 +21,7 @@ import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexConverter; import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPrioReader; import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPriorityParameters; import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexReader; +import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,7 +35,6 @@ import java.util.concurrent.Callable; @Singleton public class IndexServicesFactory { private final Path tmpFileDir; - private final EdgeDomainBlacklist domainBlacklist; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -58,12 +59,10 @@ public class IndexServicesFactory { public IndexServicesFactory( @Named("tmp-file-dir") Path tmpFileDir, @Named("partition-root-slow") Path partitionRootSlow, - @Named("partition-root-fast") Path partitionRootFast, - EdgeDomainBlacklist domainBlacklist + @Named("partition-root-fast") Path partitionRootFast ) throws IOException { this.tmpFileDir = tmpFileDir; - this.domainBlacklist = domainBlacklist; this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat"); this.keywordLexiconFile = new RootDataFile(partitionRootSlow, "dictionary.dat"); @@ -106,8 +105,8 @@ public class IndexServicesFactory { } - public void convertIndex() throws IOException { - convertForwardIndex(); + public void convertIndex(DomainRankings domainRankings) throws IOException { + 
convertForwardIndex(domainRankings); convertFullReverseIndex(); convertPriorityReverseIndex(); @@ -148,13 +147,14 @@ public class IndexServicesFactory { tryGc(); } - private void convertForwardIndex() throws IOException { + private void convertForwardIndex(DomainRankings domainRankings) throws IOException { logger.info("Converting forward index data"); - new ForwardIndexConverter(tmpFileDir, + new ForwardIndexConverter( writerIndexFile.get(0), fwdIndexDocId.get(NEXT_PART).toPath(), - fwdIndexDocData.get(NEXT_PART).toPath()) + fwdIndexDocData.get(NEXT_PART).toPath(), + domainRankings) .convert(); tryGc(); @@ -212,8 +212,8 @@ public class IndexServicesFactory { } } - public SearchIndex createIndexBucket() { - return new SearchIndex(this, new EdgeIndexControl(this)); + public SearchIndex createIndexBucket(EdgeIndexSearchSetsService searchSetsService) { + return new SearchIndex(this, new EdgeIndexControl(this, searchSetsService)); } public SearchIndexReader getSearchIndexReader() throws IOException { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java index 4847d9fc..4331131f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java @@ -8,7 +8,8 @@ import java.util.Set; import static java.lang.Math.max; import static java.lang.Math.min; -public record EdgePageDocumentsMetadata(int encSize, +public record EdgePageDocumentsMetadata(int rank, + int encSize, int topology, int year, int sets, @@ -16,9 +17,13 @@ public record EdgePageDocumentsMetadata(int encSize, byte flags) { + public static final long RANK_MASK = 0xFFL; + public static final int RANK_SHIFT = 48; + public static final long ENCSIZE_MASK = 0xFFL; - public static final int ENCSIZE_SHIFT = 48; + public static 
final int ENCSIZE_SHIFT = 40; public static final int ENCSIZE_MULTIPLIER = 50; + public static final long TOPOLOGY_MASK = 0xFFL; public static final int TOPOLOGY_SHIFT = 32; @@ -39,7 +44,7 @@ public record EdgePageDocumentsMetadata(int encSize, this(defaultValue()); } public EdgePageDocumentsMetadata(int topology, int year, int sets, int quality, EnumSet flags) { - this(0, topology, year, sets, quality, encodeFlags(flags)); + this(0, 0, topology, year, sets, quality, encodeFlags(flags)); } public EdgePageDocumentsMetadata withSize(int size) { @@ -49,7 +54,7 @@ public record EdgePageDocumentsMetadata(int encSize, final int encSize = (int) Math.min(ENCSIZE_MASK, Math.max(1, size / ENCSIZE_MULTIPLIER)); - return new EdgePageDocumentsMetadata(encSize, topology, year, sets, quality, flags); + return new EdgePageDocumentsMetadata(rank, encSize, topology, year, sets, quality, flags); } private static byte encodeFlags(Set flags) { @@ -63,7 +68,8 @@ public record EdgePageDocumentsMetadata(int encSize, } public EdgePageDocumentsMetadata(long value) { - this( (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK), + this( (int) ((value >>> RANK_SHIFT) & RANK_MASK), + (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK), (int) ((value >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK), (int) ((value >>> YEAR_SHIFT) & YEAR_MASK), (int) ((value >>> SETS_SHIFT) & SETS_MASK), @@ -84,12 +90,13 @@ public record EdgePageDocumentsMetadata(int encSize, ret |= min(YEAR_MASK, max(0, year)) << YEAR_SHIFT; ret |= min(TOPOLOGY_MASK, max(0, topology)) << TOPOLOGY_SHIFT; ret |= min(ENCSIZE_MASK, max(0, encSize)) << ENCSIZE_SHIFT; + ret |= min(RANK_MASK, max(0, rank)) << RANK_SHIFT; return ret; } public boolean isEmpty() { - return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0; + return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0 && rank == 0; } public static int decodeQuality(long encoded) { @@ -112,6 +119,12 @@ public record 
EdgePageDocumentsMetadata(int encSize, return ENCSIZE_MULTIPLIER * (int) ((encoded >>> ENCSIZE_SHIFT) & ENCSIZE_MASK); } + public static int decodeRank(long encoded) { + return (int) ((encoded >>> RANK_SHIFT) & RANK_MASK); + } + public static long encodeRank(long encoded, int rank) { + return encoded | min(RANK_MASK, max(0, rank)) << RANK_SHIFT; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/DomainRankings.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/DomainRankings.java new file mode 100644 index 00000000..d6ddcd62 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/DomainRankings.java @@ -0,0 +1,43 @@ +package nu.marginalia.wmsa.edge.index.postings; + +import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; +import it.unimi.dsi.fastutil.ints.Int2ShortOpenHashMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static java.lang.Math.max; +import static java.lang.Math.min; + +public class DomainRankings { + private final Int2ShortOpenHashMap rankings; + + private final int MAX_MEANINGFUL_RANK = 50_000; + private final int MAX_RANK_VALUE = 255; + private final int MIN_RANK_VALUE = 1; + private final double RANK_SCALING_FACTOR = (double) MAX_RANK_VALUE / MAX_MEANINGFUL_RANK; + + public DomainRankings() { + rankings = new Int2ShortOpenHashMap(); + } + public DomainRankings(Int2IntOpenHashMap values) { + rankings = new Int2ShortOpenHashMap(values.size()); + values.forEach(this::putRanking); + } + + private void putRanking(int domainId, int value) { + rankings.put(domainId, scaleRank(value)); + } + + private short scaleRank(int value) { + double rankScaled = RANK_SCALING_FACTOR * value; + return (short) min(MAX_RANK_VALUE, max(MIN_RANK_VALUE, rankScaled)); + } + + public int getRanking(int domainId) { + return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE); + } + + public int size() { + return rankings.size(); + } +} diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexControl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexControl.java index a1475af3..42e7e32f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexControl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexControl.java @@ -6,6 +6,7 @@ import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView; import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl; +import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService; import nu.marginalia.wmsa.edge.index.svc.EdgeOpsLockService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,13 +27,14 @@ public class SearchIndexControl { @Inject public SearchIndexControl(IndexServicesFactory servicesFactory, - EdgeOpsLockService opsLockService) { + EdgeOpsLockService opsLockService, + EdgeIndexSearchSetsService searchSetsService) { this.servicesFactory = servicesFactory; this.primaryIndexWriter = servicesFactory.getIndexWriter(0); this.secondaryIndexWriter = servicesFactory.getIndexWriter(1); - index = servicesFactory.createIndexBucket(); + index = servicesFactory.createIndexBucket(searchSetsService); this.opsLockService = opsLockService; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java index e066f734..8d821c88 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java @@ -3,6 +3,8 @@ package 
nu.marginalia.wmsa.edge.index.postings.forward; import com.upserve.uppend.blobs.NativeIO; import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; import nu.marginalia.util.array.LongArray; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; +import nu.marginalia.wmsa.edge.index.postings.DomainRankings; import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader; import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; import org.roaringbitmap.IntConsumer; @@ -18,26 +20,26 @@ import java.nio.file.Path; import static nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexParameters.*; public class ForwardIndexConverter { - private static final int RWF_BIN_SIZE = 10_000_000; - private final Path tmpFileDir; private final File inputFile; private final Logger logger = LoggerFactory.getLogger(getClass()); private final Path outputFileDocsId; private final Path outputFileDocsData; + private final DomainRankings domainRankings; - public ForwardIndexConverter(Path tmpFileDir, + public ForwardIndexConverter( File inputFile, Path outputFileDocsId, - Path outputFileDocsData + Path outputFileDocsData, + DomainRankings domainRankings ) { - this.tmpFileDir = tmpFileDir; this.inputFile = inputFile; this.outputFileDocsId = outputFileDocsId; this.outputFileDocsData = outputFileDocsData; + this.domainRankings = domainRankings; } public void convert() throws IOException { @@ -50,6 +52,8 @@ public class ForwardIndexConverter { logger.info("Converting {} {}",inputFile, journalReader.fileHeader); + logger.info("Domain Rankings size = {}", domainRankings.size()); + try { LongArray docsFileId = getDocIds(outputFileDocsId, journalReader); @@ -68,7 +72,10 @@ public class ForwardIndexConverter { journalReader.forEach(entry -> { long entryOffset = (long) ENTRY_SIZE * docIdToIdx.get(entry.urlId()); - docFileData.set(entryOffset + METADATA_OFFSET, entry.docMeta()); + int ranking = 
domainRankings.getRanking(entry.domainId()); + long meta = EdgePageDocumentsMetadata.encodeRank(entry.docMeta(), ranking); + + docFileData.set(entryOffset + METADATA_OFFSET, meta); docFileData.set(entryOffset + DOMAIN_OFFSET, entry.domainId()); }); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ParamMatchingQueryFilter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ParamMatchingQueryFilter.java index a3c30bab..67c1b9e2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ParamMatchingQueryFilter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ParamMatchingQueryFilter.java @@ -33,6 +33,11 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf { if (!validateSize(post)) { return false; } + + if (!validateRank(post)) { + return false; + } + return true; } @@ -51,6 +56,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf { return limit.test(quality); } + private boolean validateYear(ForwardIndexReader.DocPost post) { if (params.year().type() == SpecificationLimitType.NONE) return true; @@ -69,6 +75,15 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf { return params.size().test(postVal); } + private boolean validateRank(ForwardIndexReader.DocPost post) { + if (params.rank().type() == SpecificationLimitType.NONE) + return true; + + int postVal = EdgePageDocumentsMetadata.decodeRank(post.meta()); + + return params.rank().test(postVal); + } + @Override public double cost() { return 32; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java index 5679c5be..6f4475e7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java @@ -53,6 +53,11 @@ public class ReverseIndexReader { } public EntrySource documents(int wordId, ReverseIndexEntrySourceBehavior behavior) { + if (null == words) { + logger.warn("Reverse index is not ready, dropping query"); + return new EmptyEntrySource(); + } + if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource(); long offset = words.get(wordId); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryParams.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryParams.java index 298e6c01..031410fc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryParams.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryParams.java @@ -7,6 +7,7 @@ import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; public record IndexQueryParams(SpecificationLimit qualityLimit, SpecificationLimit year, SpecificationLimit size, + SpecificationLimit rank, SearchSet searchSet, QueryStrategy queryStrategy ) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingAlgorithm.java similarity index 67% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingAlgorithm.java index ca6f7b62..94a89c15 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingAlgorithm.java @@ -1,21 +1,19 @@ -package nu.marginalia.util.ranking; +package nu.marginalia.wmsa.edge.index.ranking; -import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntIntHashMap; import 
gnu.trove.map.hash.TIntObjectHashMap; import it.unimi.dsi.fastutil.ints.IntArrays; -import it.unimi.dsi.fastutil.ints.IntComparator; -import org.roaringbitmap.RoaringBitmap; +import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Arrays; -import java.util.Comparator; import java.util.HashSet; import java.util.Set; -import java.util.function.IntToDoubleFunction; -import java.util.stream.IntStream; +import java.util.function.Supplier; + +import static java.lang.Math.min; public abstract class RankingAlgorithm { protected final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); @@ -133,29 +131,7 @@ public abstract class RankingAlgorithm { return domainsById.size(); } - - public RankVector pageRankVector() { - RankVector rank = new RankVector(1.d / domainsById.size()); - - int iter_max = 100; - for (int i = 0; i < iter_max; i++) { - RankVector newRank = createNewRankVector(rank); - - double oldNorm = rank.norm(); - double newNorm = newRank.norm(); - double dNorm = oldNorm - newNorm ; - if (i < iter_max-1) { - adjustRankVector(newRank, dNorm, oldNorm); - } - - rank = newRank; - } - - return rank; - } - - - public RoaringBitmap pageRank(int resultCount) { + public T pageRank(int resultCount, Supplier> accumulatorP) { RankVector rank = new RankVector(1.d / domainsById.size()); int iter_max = 100; @@ -174,10 +150,10 @@ public abstract class RankingAlgorithm { } - return rank.getRanking(resultCount); + return rank.getRanking(resultCount, accumulatorP).get(); } - public RoaringBitmap pageRankWithPeripheralNodes(int resultCount) { + public T pageRankWithPeripheralNodes(int resultCount, Supplier> accumulatorP) { RankVector rank = new RankVector(1.d / domainsById.size()); int iter_max = 100; @@ -201,32 +177,11 @@ public abstract class RankingAlgorithm { logger.info("PRWPN iteration done"); - return rank.getRanking(resultCount); + return rank.getRanking(resultCount, 
accumulatorP).get(); } abstract void adjustRankVector(RankVector vector, double dNorm, double oldNorm); - public TIntList pageRank(IntToDoubleFunction weight, int resultCount) { - RankVector rank = new RankVector(1.d / domainsById.size()); - - int iter_max = 100; - for (int i = 0; i < iter_max; i++) { - RankVector newRank = createNewRankVector(rank); - - double oldNorm = rank.norm(); - double newNorm = newRank.norm(); - double dNorm = oldNorm - newNorm ; - - if (i < iter_max-1) { - adjustRankVector(newRank, dNorm, oldNorm); - } - - rank = newRank; - } - - return rank.getRanking(weight, resultCount); - } - abstract RankVector createNewRankVector(RankVector rank); public boolean includeInRanking(RankingDomainData data) { @@ -271,9 +226,8 @@ public abstract class RankingAlgorithm { public double norm() { double v = 0.; - for (int i = 0; i < rank.length; i++) { - if (rank[i] > 0) { v+=rank[i]; } - else { v -= rank[i]; } + for (double value : rank) { + v += Math.abs(value); } return v; } @@ -281,73 +235,38 @@ public abstract class RankingAlgorithm { public double norm(RankVector other) { double v = 0.; for (int i = 0; i < rank.length; i++) { - double dv = rank[i] - other.get(i); - - if (dv > 0) { v+=dv; } - else { v -= dv; } + v += Math.abs(rank[i] - other.get(i)); } return v; } - public TIntList getRanking(IntToDoubleFunction other, int numResults) { - TIntArrayList list = new TIntArrayList(numResults); + public RankingResultAccumulator getRanking(int numResults, Supplier> accumulatorP) { - Comparator comparator = Comparator.comparing(i -> Math.sqrt(other.applyAsDouble(domainIdToIndex.get(i)) * rank[i])); - - IntStream.range(0, rank.length) - .boxed() - .sorted(comparator.reversed()) - .map(domainIndexToId::get) - .limit(numResults) - .forEach(list::add); - - return list; - } - - public RoaringBitmap getRanking(int numResults) { if (numResults < 0) { numResults = domainIdToIndex.size(); } - if (numResults >= rank.length) { - numResults = rank.length; - } + numResults = 
min(numResults, min(domainIdToIndex.size(), rank.length)); - RoaringBitmap list = new RoaringBitmap(); + int[] nodes = sortOrder(rank); + var accumulator = accumulatorP.get(); - int[] nodes = new int[rank.length]; - Arrays.setAll(nodes, i->i); - IntComparator comp = (i,j) -> (int) Math.signum(rank[j] - rank[i]); - IntArrays.quickSort(nodes, comp); - - int i; - - for (i = 0; i < numResults; i++) { + for (int i = 0; i < numResults; i++) { int id = domainIndexToId.get(nodes[i]); if (includeInRanking(domainsById.get(id))) - list.add(id); + accumulator.add(id, i); } - for (; i < nodes.length && domainsById.size() < numResults; i++) { - int id = domainIndexToId.get(nodes[i]); - - if (includeInRanking(domainsById.get(id))) - list.add(id); - } - - - return list; + return accumulator; } + private static int[] sortOrder(double[] values) { - public void incrementAll(double v) { - for (int i = 0; i < rank.length; i++) { - rank[i]+=v; - } - } + int[] ret = new int[values.length]; + Arrays.setAll(ret, i->i); + IntArrays.quickSort(ret, (i,j) -> (int) Math.signum(values[j] - values[i])); - int size() { - return domainsById.size(); + return ret; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingDomainData.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingDomainData.java index 2a4b0f65..d72da886 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingDomainData.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.ranking; +package nu.marginalia.wmsa.edge.index.ranking; import lombok.AllArgsConstructor; import lombok.Data; diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingDomainFetcher.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingDomainFetcher.java index 1c2e6849..70be6c15 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingDomainFetcher.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.ranking; +package nu.marginalia.wmsa.edge.index.ranking; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/ReversePageRank.java similarity index 84% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/ReversePageRank.java index 7d3b17c4..bb51ca77 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/ReversePageRank.java @@ -1,10 +1,10 @@ -package nu.marginalia.util.ranking; +package nu.marginalia.wmsa.edge.index.ranking; -public class BetterReversePageRank extends RankingAlgorithm { +public class ReversePageRank extends RankingAlgorithm { - public BetterReversePageRank(RankingDomainFetcher domains, String... origins) { + public ReversePageRank(RankingDomainFetcher domains, String... 
origins) { super(domains, origins); } @@ -20,8 +20,6 @@ public class BetterReversePageRank extends RankingAlgorithm { double newRankValue = 0; if (links != null && links.size() > 0) { - - for (int j = 0; j < links.size(); j++) { var revLinks = linkDataDest2Src[links.getQuick(j)]; newRankValue += rank.get(links.getQuick(j)) / revLinks.size(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/StandardPageRank.java similarity index 77% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/StandardPageRank.java index f1f9b0b1..2319f299 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/StandardPageRank.java @@ -1,9 +1,9 @@ -package nu.marginalia.util.ranking; +package nu.marginalia.wmsa.edge.index.ranking; -public class BetterStandardPageRank extends RankingAlgorithm { +public class StandardPageRank extends RankingAlgorithm { - public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) { + public StandardPageRank(RankingDomainFetcher domains, String... 
origins) { super(domains, origins); } @@ -38,8 +38,7 @@ public class BetterStandardPageRank extends RankingAlgorithm { @Override void adjustRankVector(RankVector vector, double dNorm, double oldNorm) { - originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() /* dNorm/originDomainIds.size() */ )); -// vector.incrementAll(0.14*dNorm/vector.size()); + originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() )); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultAccumulator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultAccumulator.java new file mode 100644 index 00000000..fea37b00 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultAccumulator.java @@ -0,0 +1,6 @@ +package nu.marginalia.wmsa.edge.index.ranking.accumulator; + +public interface RankingResultAccumulator { + void add(int domainId, int rank); + T get(); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultBitSetAccumulator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultBitSetAccumulator.java new file mode 100644 index 00000000..26e72522 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultBitSetAccumulator.java @@ -0,0 +1,17 @@ +package nu.marginalia.wmsa.edge.index.ranking.accumulator; + +import org.roaringbitmap.RoaringBitmap; + +public class RankingResultBitSetAccumulator implements RankingResultAccumulator { + private final RoaringBitmap result = new RoaringBitmap(); + + @Override + public void add(int domainId, int rank) { + result.add(domainId); + } + + @Override + public RoaringBitmap get() { + return result; + } +} diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultHashMapAccumulator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultHashMapAccumulator.java new file mode 100644 index 00000000..653806ed --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultHashMapAccumulator.java @@ -0,0 +1,21 @@ +package nu.marginalia.wmsa.edge.index.ranking.accumulator; + +import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; + +public class RankingResultHashMapAccumulator implements RankingResultAccumulator { + private final Int2IntOpenHashMap result; + + public RankingResultHashMapAccumulator(int size) { + result = new Int2IntOpenHashMap(size); + } + + @Override + public void add(int domainId, int rank) { + result.put(domainId, rank); + } + + @Override + public Int2IntOpenHashMap get() { + return result; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultListAccumulator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultListAccumulator.java new file mode 100644 index 00000000..663483e4 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultListAccumulator.java @@ -0,0 +1,24 @@ +package nu.marginalia.wmsa.edge.index.ranking.accumulator; + +import gnu.trove.list.array.TIntArrayList; + +public class RankingResultListAccumulator implements RankingResultAccumulator { + private final TIntArrayList result; + + public RankingResultListAccumulator(int size) { + result = new TIntArrayList(size); + } + public RankingResultListAccumulator() { + result = new TIntArrayList(10_000); + } + + @Override + public void add(int domainId, int rank) { + result.add(domainId); + } + + @Override + public TIntArrayList get() { + return result; + } +} diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/OldReversePageRankV2.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/OldReversePageRankV2.java index 02823563..59fcda0d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/OldReversePageRankV2.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.ranking.old; +package nu.marginalia.wmsa.edge.index.ranking.old; import com.zaxxer.hikari.HikariDataSource; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/StandardPageRank.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/StandardPageRank.java index 74bef70a..cd58f7be 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/StandardPageRank.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.ranking.old; +package nu.marginalia.wmsa.edge.index.ranking.old; import com.zaxxer.hikari.HikariDataSource; @@ -125,7 +125,6 @@ public class StandardPageRank { final TIntArrayList empty = new TIntArrayList(); - double rankNorm = rank.norm(); RankVector newRank = new RankVector(0); for (DomainData domain : domains.valueCollection()) { @@ -176,8 +175,6 @@ public class StandardPageRank { } }); } - - TIntHashSet deadEnds = new TIntHashSet(domains.size()); } private class RankVector { diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PerusePageRankV2.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PerusePageRankV2.java index 89c1dfb9..409a92ad 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PerusePageRankV2.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.ranking.tool; +package nu.marginalia.wmsa.edge.index.ranking.tool; import com.zaxxer.hikari.HikariDataSource; @@ -10,9 +10,9 @@ import it.unimi.dsi.fastutil.ints.IntArrays; import it.unimi.dsi.fastutil.ints.IntComparator; import lombok.AllArgsConstructor; import lombok.SneakyThrows; -import nu.marginalia.util.ranking.RankingAlgorithm; -import nu.marginalia.util.ranking.RankingDomainData; -import nu.marginalia.util.ranking.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.ranking.RankingAlgorithm; +import nu.marginalia.wmsa.edge.index.ranking.RankingDomainData; +import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; import org.jetbrains.annotations.NotNull; @@ -33,8 +33,6 @@ public class PerusePageRankV2 { TIntArrayList[] linkDataSrc2Dest; TIntArrayList[] linkDataDest2Src; - private static final boolean getNames = true; - private final Logger logger = LoggerFactory.getLogger(getClass()); static final LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool2.java 
similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool2.java index 55f16a5a..ccb8c15c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool2.java @@ -1,9 +1,10 @@ -package nu.marginalia.util.ranking.tool; +package nu.marginalia.wmsa.edge.index.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.util.ranking.BetterReversePageRank; -import nu.marginalia.util.ranking.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank; +import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; import org.mariadb.jdbc.Driver; @@ -17,8 +18,6 @@ public class UpdateDomainRanksTool2 { private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class); - public final long domainIdMax = -1; - public int domainCount; private volatile static int rankMax; static final LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); @@ -35,20 +34,21 @@ public class UpdateDomainRanksTool2 { logger.info("Ranking"); var ds = new DatabaseModule().provideConnection(); var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); - var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); + var rpr = new ReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); - var rankVector = rpr.pageRankVector(); 
rankMax = rpr.size(); uploader.start(); - var rankData = rpr.pageRankWithPeripheralNodes(rankMax); - for (int i : rankData) { + var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new); + + rankData.forEach(i -> { try { uploadQueue.put(i); } catch (InterruptedException e) { e.printStackTrace(); } - } + return true; + }); long end = System.currentTimeMillis(); running = false; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java index d04b37b6..5988df3b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java @@ -129,6 +129,7 @@ public class EdgeIndexQueryService { specsSet.quality, specsSet.year, specsSet.size, + specsSet.rank, getSearchSet(specsSet), specsSet.queryStrategy); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java index a09047eb..072e39ab 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java @@ -2,51 +2,43 @@ package nu.marginalia.wmsa.edge.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; -import com.zaxxer.hikari.HikariDataSource; -import gnu.trove.list.TIntList; -import gnu.trove.list.array.TIntArrayList; import lombok.SneakyThrows; -import nu.marginalia.util.ranking.BetterReversePageRank; -import nu.marginalia.util.ranking.BetterStandardPageRank; -import nu.marginalia.util.ranking.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank; +import 
nu.marginalia.wmsa.edge.index.ranking.StandardPageRank; +import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator; +import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.model.RankingSettings; +import nu.marginalia.wmsa.edge.index.postings.DomainRankings; import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; -import org.roaringbitmap.RoaringBitmap; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.IOException; @Singleton public class EdgeIndexSearchSetsService { - private final HikariDataSource dataSource; - private RankingDomainFetcher rankingDomains; + private final RankingDomainFetcher rankingDomains; private final RankingSettings rankingSettings; - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final SearchSet anySet = new SearchSetAny(); private volatile RankingSearchSet retroSet; private volatile RankingSearchSet smallWebSet; private volatile RankingSearchSet academiaSet; + private volatile DomainRankings domainRankings = new DomainRankings(); + @Inject - public EdgeIndexSearchSetsService(HikariDataSource dataSource, - RankingDomainFetcher rankingDomains, + public EdgeIndexSearchSetsService(RankingDomainFetcher rankingDomains, RankingSettings rankingSettings, IndexServicesFactory servicesFactory) throws IOException { - this.dataSource = dataSource; this.rankingDomains = rankingDomains; this.rankingSettings = rankingSettings; smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat")); academiaSet = new 
RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat")); retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat")); - - logger.info("SearchIndexDao ranking settings = {}", rankingSettings); } public void recalculateAll() { @@ -55,52 +47,27 @@ public class EdgeIndexSearchSetsService { updateSmallWebDomains(); } - @SneakyThrows - public RoaringBitmap goodUrls() { - RoaringBitmap domains = new RoaringBitmap(); - RoaringBitmap urls = new RoaringBitmap(); - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) { - stmt.setFetchSize(10_000); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - domains.add(rsp.getInt(1)); - } - } - - // For some reason, doing this "INNER JOIN" in Java is significantly faster than doing it in SQL - try (var stmt = connection.prepareStatement("SELECT ID,DOMAIN_ID FROM EC_URL WHERE VISITED AND EC_URL.STATE='OK'")) { - stmt.setFetchSize(10_000); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - if (domains.contains(rsp.getInt(2))) { - urls.add(rsp.getInt(1)); - } - } - } - - } - - return urls; - } - @SneakyThrows public void updateRetroDomains() { - var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new)); - var data = spr.pageRankWithPeripheralNodes(spr.size() / 2); + var spr = new StandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new)); + var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new); synchronized (this) { retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data); retroSet.write(); } + + var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2, () -> new RankingResultHashMapAccumulator(100_000)); + synchronized (this) { + domainRankings = new DomainRankings(ranks); + } 
} @SneakyThrows public void updateSmallWebDomains() { - var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new)); + var rpr = new ReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new)); rpr.setMaxKnownUrls(750); - var data = rpr.pageRankWithPeripheralNodes(rpr.size()); + var data = rpr.pageRankWithPeripheralNodes(rpr.size(), RankingResultBitSetAccumulator::new); synchronized (this) { smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data); @@ -110,8 +77,8 @@ public class EdgeIndexSearchSetsService { @SneakyThrows public void updateAcademiaDomains() { - var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new)); - var data = spr.pageRankWithPeripheralNodes(spr.size()/2); + var spr = new StandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new)); + var data = spr.pageRankWithPeripheralNodes(spr.size()/2, RankingResultBitSetAccumulator::new); synchronized (this) { academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data); @@ -119,41 +86,8 @@ public class EdgeIndexSearchSetsService { } } - @SneakyThrows - public TIntList getStandardDomains() { - TIntArrayList results = new TIntArrayList(); - - try (var connection = dataSource.getConnection(); - var stmt = connection.prepareStatement( - """ - SELECT ID FROM EC_DOMAIN - WHERE INDEXED>0 - AND STATE='ACTIVE' - AND DOMAIN_ALIAS IS NULL - ORDER BY ID ASC - """); - ) { - var rs = stmt.executeQuery(); - while (rs.next()) { - results.add(rs.getInt(1)); - } - } - return results; - - } - - @SneakyThrows - public TIntList getSpecialDomains() { - TIntArrayList results = new TIntArrayList(); - try (var connection = dataSource.getConnection(); - var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'") - ) { - var rs = stmt.executeQuery(); - while (rs.next()) { - results.add(rs.getInt(1)); - } - } - return 
results; + public DomainRankings getDomainRankings() { + return domainRankings; } public SearchSet getSearchSetByName(SearchSetIdentifier searchSetIdentifier) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java index 98bf9444..6d97192c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java @@ -24,6 +24,11 @@ public record EdgeSearchResultKeywordScore(int set, sum += 20; } + int rank = EdgePageDocumentsMetadata.decodeRank(encodedDocMetadata) - 13; + if (rank < 0) + sum += rank / 2; + else + sum += rank / 4; return sum; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java index f60a4b8f..84f133d7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java @@ -19,6 +19,7 @@ public class EdgeSearchSpecification { public final SpecificationLimit quality; public final SpecificationLimit year; public final SpecificationLimit size; + public final SpecificationLimit rank; public final QueryLimits queryLimits; public final QueryStrategy queryStrategy; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java index 31ec39bc..952e0fb2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java @@ -97,6 +97,7 @@ public 
class QueryFactory { SpecificationLimit qualityLimit = profile.getQualityLimit(); SpecificationLimit year = profile.getYearLimit(); SpecificationLimit size = profile.getSizeLimit(); + SpecificationLimit rank = SpecificationLimit.none(); for (Token t : basicQuery) { if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) { @@ -116,6 +117,9 @@ public class QueryFactory { if (t.type == TokenType.SIZE_TERM) { size = parseSpecificationLimit(t.str); } + if (t.type == TokenType.RANK_TERM) { + rank = parseSpecificationLimit(t.str); + } if (t.type == TokenType.QS_TERM) { queryStrategy = parseQueryStrategy(t.str); } @@ -154,6 +158,8 @@ public class QueryFactory { case QUALITY_TERM: case YEAR_TERM: case SIZE_TERM: + case RANK_TERM: + case QS_TERM: break; // case NEAR_TERM: near = t.str; @@ -199,6 +205,7 @@ public class QueryFactory { .quality(qualityLimit) .year(year) .size(size) + .rank(rank) .domains(domains) .queryStrategy(queryStrategy) .searchSetIdentifier(profile.searchSetIdentifier); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java index 04b91c88..5551a67a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java @@ -93,6 +93,8 @@ public class QueryParser { entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr)); } else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) { entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr)); + } else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) { + entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr)); } else if (t.str.startsWith("qs=")) { entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr)); } else if (t.str.contains(":")) { @@ -508,6 
+510,7 @@ enum TokenType implements Predicate { QUALITY_TERM, YEAR_TERM, SIZE_TERM, + RANK_TERM, NEAR_TERM, QS_TERM, diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb b/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb index 0126478b..dc0d7157 100644 --- a/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb @@ -56,6 +56,9 @@ year=2005(beta) The document was ostensibly published in 2005 year<2005(beta) The document was ostensibly published in or before 2005 + rank>50(beta) The ranking of the website is at least 50 in a span of 1 - 255 + rank<50(beta) The ranking of the website is at most 50 in a span of 1 - 255 + format:html5Filter documents using the HTML5 standard. This is typically modern websites. format:xhtmlFilter documents using the XHTML standard format:html123Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites. 
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java index 9c0d9beb..a3552a85 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java @@ -3,13 +3,15 @@ package nu.marginalia.wmsa.edge.index.model; import org.junit.jupiter.api.Test; +import java.util.EnumSet; + import static org.junit.jupiter.api.Assertions.assertEquals; class EdgePageDocumentsMetadataTest { @Test public void codecYear() { - var meta = new EdgePageDocumentsMetadata(0, 0, 192, 0, 0, (byte) 0); + var meta = new EdgePageDocumentsMetadata(0, 0, 0, 192, 0, 0, (byte) 0); long encoded = meta.encode(); var decoded = new EdgePageDocumentsMetadata(encoded); assertEquals(192, decoded.year()); @@ -17,7 +19,7 @@ class EdgePageDocumentsMetadataTest { @Test public void codecTopology() { - var meta = new EdgePageDocumentsMetadata(0, 192, 0, 0, 0, (byte) 0); + var meta = new EdgePageDocumentsMetadata(0, 0, 192, 0, 0, 0, (byte) 0); long encoded = meta.encode(); var decoded = new EdgePageDocumentsMetadata(encoded); assertEquals(192, decoded.topology()); @@ -25,7 +27,7 @@ class EdgePageDocumentsMetadataTest { @Test public void codecSets() { - var meta = new EdgePageDocumentsMetadata(0, 0, 0, 14, 0, (byte) 0); + var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 14, 0, (byte) 0); long encoded = meta.encode(); var decoded = new EdgePageDocumentsMetadata(encoded); assertEquals(14, decoded.sets()); @@ -33,7 +35,7 @@ class EdgePageDocumentsMetadataTest { @Test public void codecQuality() { - var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 9, (byte) 0); + var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, 9, (byte) 0); long encoded = meta.encode(); var decoded = new 
EdgePageDocumentsMetadata(encoded); assertEquals(9, decoded.quality()); @@ -41,7 +43,7 @@ class EdgePageDocumentsMetadataTest { @Test public void codecFlags() { - var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, (byte) 255); + var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, 0, (byte) 255); long encoded = meta.encode(); System.out.println(Long.toHexString(encoded)); var decoded = new EdgePageDocumentsMetadata(encoded); @@ -57,7 +59,17 @@ class EdgePageDocumentsMetadataTest { assertEquals(50, new EdgePageDocumentsMetadata(0).withSize(4).size()); assertEquals(50, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(4).encode())); - assertEquals(50*255, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).encode())); - assertEquals(50*255, new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).size()); + assertEquals(50 * 255, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).encode())); + assertEquals(50 * 255, new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).size()); + } + + @Test + public void encRank() { + var meta = new EdgePageDocumentsMetadata(5, 22, 3, 8, EnumSet.noneOf(EdgePageDocumentFlags.class)) + .withSize(0xffffffff).encode(); + var enc2 = EdgePageDocumentsMetadata.encodeRank(meta, 83); + + assertEquals(83, EdgePageDocumentsMetadata.decodeRank(enc2)); + assertEquals(5, EdgePageDocumentsMetadata.decodeTopology(enc2)); } } \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java index 3bea5500..e5652faa 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java +++ 
b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java @@ -6,6 +6,7 @@ import nu.marginalia.util.dict.OffHeapDictionaryHashMap; import nu.marginalia.util.test.TestUtil; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.wmsa.edge.index.postings.DomainRankings; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; @@ -36,7 +37,6 @@ class ForwardIndexConverterTest { private final Logger logger = LoggerFactory.getLogger(getClass()); Path dataDir; - private Path wordsFile; private Path docsFileId; private Path docsFileData; @@ -71,7 +71,6 @@ class ForwardIndexConverterTest { var reader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile)); - wordsFile = dataDir.resolve("words.dat"); docsFileId = dataDir.resolve("docs-i.dat"); docsFileData = dataDir.resolve("docs-d.dat"); } @@ -104,18 +103,15 @@ class ForwardIndexConverterTest { @Test void testForwardIndex() throws IOException { - Path tmpDir = Path.of("/tmp"); - - new ForwardIndexConverter(tmpDir, indexFile.toFile(), docsFileId, docsFileData).convert(); + new ForwardIndexConverter(indexFile.toFile(), docsFileId, docsFileData, new DomainRankings()).convert(); var forwardReader = new ForwardIndexReader(docsFileId, docsFileData); for (int i = 36; i < workSetSize; i++) { - assertEquals(i % 5, forwardReader.getDocMeta(i)); + assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(i)); assertEquals(i/20, forwardReader.getDomainId(i)); } - TestUtil.clearTempDir(dataDir); } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java 
b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java index 27d820ea..0ef29e64 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java @@ -82,6 +82,7 @@ public class EdgeIndexIntegrationTest { .year(SpecificationLimit.none()) .quality(SpecificationLimit.none()) .size(SpecificationLimit.none()) + .rank(SpecificationLimit.none()) .domains(new ArrayList<>()) .searchSetIdentifier(SearchSetIdentifier.NONE) .subqueries(List.of(new EdgeSearchSubquery( @@ -113,6 +114,7 @@ public class EdgeIndexIntegrationTest { .year(SpecificationLimit.none()) .quality(SpecificationLimit.none()) .size(SpecificationLimit.none()) + .rank(SpecificationLimit.none()) .queryStrategy(QueryStrategy.SENTENCE) .domains(List.of(2)) .subqueries(List.of(new EdgeSearchSubquery( @@ -139,6 +141,7 @@ public class EdgeIndexIntegrationTest { .quality(SpecificationLimit.none()) .year(SpecificationLimit.equals(1998)) .size(SpecificationLimit.none()) + .rank(SpecificationLimit.none()) .queryStrategy(QueryStrategy.SENTENCE) .searchSetIdentifier(SearchSetIdentifier.NONE) .subqueries(List.of(new EdgeSearchSubquery( @@ -161,7 +164,7 @@ public class EdgeIndexIntegrationTest { long fullId = id | ((long) (32 - (id % 32)) << 32); - var header = new SearchIndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, id % 5, id, id % 20, (byte) 0).encode()); + var header = new SearchIndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java 
b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java index 2914c4f9..aaa44c35 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java @@ -4,6 +4,7 @@ import com.google.inject.AbstractModule; import com.google.inject.name.Names; import nu.marginalia.util.test.TestUtil; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; +import nu.marginalia.wmsa.edge.index.postings.DomainRankings; import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; @@ -42,11 +43,12 @@ public class EdgeIndexIntegrationTestModule extends AbstractModule { System.setProperty("small-ram", "true"); try { bind(IndexServicesFactory.class).toInstance(new IndexServicesFactory(Path.of("/tmp"), - slowDir, fastDir, null + slowDir, fastDir )); EdgeIndexSearchSetsService setsServiceMock = Mockito.mock(EdgeIndexSearchSetsService.class); when(setsServiceMock.getSearchSetByName(SearchSetIdentifier.NONE)).thenReturn(new SearchSetAny()); + when(setsServiceMock.getDomainRankings()).thenReturn(new DomainRankings()); bind(EdgeIndexSearchSetsService.class).toInstance(setsServiceMock);