From ccc5a070818b8a604407c11bbb964ce1db5b0aed Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 19 May 2022 19:13:41 +0200 Subject: [PATCH] Extracted ranking algorithms to separate directory and made them configurable --- .../util/ranking/AcademiaRank.java | 2 +- .../util/ranking/BetterReversePageRank.java | 2 +- .../util/ranking/BetterStandardPageRank.java | 3 +- .../util/ranking/BuggyReversePageRank.java | 2 +- .../util/ranking/BuggyStandardPageRank.java | 2 +- .../util/ranking/RankingAlgorithm.java | 2 +- .../ranking/old/OldReversePageRankV2.java | 2 +- .../util/ranking/old/StandardPageRank.java | 2 +- .../util/ranking/tool/DedupTool.java | 2 +- .../util/ranking/tool/PerusePageRankV2.java | 5 +- .../ranking/tool/TestAcademiaRankTool.java | 4 +- .../ranking/tool/UpdateDomainRanksTool.java | 4 +- .../ranking/tool/UpdateDomainRanksTool2.java | 4 +- .../wmsa/edge/crawler/domain/RssCrawler.java | 2 +- .../CrawlJobExtractorPageRankMain.java | 4 +- .../wmsa/edge/index/EdgeIndexModule.java | 16 ++++++ .../edge/index/model/RankingSettings.java | 28 +++++++++++ .../edge/index/service/SearchIndexDao.java | 23 ++++++--- .../service/query/SearchIndexPartitioner.java | 16 +++--- .../wmsa/edge/tools/IndexMergerMain.java | 3 +- .../processing/SentenceExtractorTest.java | 4 +- .../edge/index/model/RankingSettingsTest.java | 49 +++++++++++++++++++ 22 files changed, 139 insertions(+), 42 deletions(-) rename marginalia_nu/src/main/java/nu/marginalia/{wmsa/edge/index/service => }/util/ranking/AcademiaRank.java (96%) rename marginalia_nu/src/main/java/nu/marginalia/{wmsa/edge/index/service => }/util/ranking/BetterReversePageRank.java (95%) rename marginalia_nu/src/main/java/nu/marginalia/{wmsa/edge/index/service => }/util/ranking/BetterStandardPageRank.java (93%) rename marginalia_nu/src/main/java/nu/marginalia/{wmsa/edge/index/service => }/util/ranking/BuggyReversePageRank.java (95%) rename marginalia_nu/src/main/java/nu/marginalia/{wmsa/edge/index/service => }/util/ranking/BuggyStandardPageRank.java (96%) rename marginalia_nu/src/main/java/nu/marginalia/{wmsa/edge/index/service => }/util/ranking/RankingAlgorithm.java (99%) rename marginalia_nu/src/main/java/nu/marginalia/{wmsa/edge/index/service => }/util/ranking/old/OldReversePageRankV2.java (99%) rename marginalia_nu/src/main/java/nu/marginalia/{wmsa/edge/index/service => }/util/ranking/old/StandardPageRank.java (99%) rename marginalia_nu/src/main/java/nu/marginalia/{wmsa/edge/index/service => }/util/ranking/tool/DedupTool.java (97%) rename marginalia_nu/src/main/java/nu/marginalia/{wmsa/edge/index/service => }/util/ranking/tool/PerusePageRankV2.java (98%) rename marginalia_nu/src/main/java/nu/marginalia/{wmsa/edge/index/service => }/util/ranking/tool/TestAcademiaRankTool.java (87%) rename marginalia_nu/src/main/java/nu/marginalia/{wmsa/edge/index/service => }/util/ranking/tool/UpdateDomainRanksTool.java (95%) rename marginalia_nu/src/main/java/nu/marginalia/{wmsa/edge/index/service => }/util/ranking/tool/UpdateDomainRanksTool2.java (96%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/RankingSettings.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/RankingSettingsTest.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/AcademiaRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/AcademiaRank.java rename to marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java index b14dc405..272a1798 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/AcademiaRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/AcademiaRank.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.util.ranking; +package nu.marginalia.util.ranking; import com.zaxxer.hikari.HikariDataSource; import gnu.trove.list.TIntList; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/BetterReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/BetterReversePageRank.java rename to marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java index 798be55a..6f6b02e1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/BetterReversePageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.util.ranking; +package nu.marginalia.util.ranking; import com.zaxxer.hikari.HikariDataSource; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/BetterStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/BetterStandardPageRank.java rename to marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java index 497ac146..4457195b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/BetterStandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java @@ -1,8 +1,7 @@ -package nu.marginalia.wmsa.edge.index.service.util.ranking; +package nu.marginalia.util.ranking; import com.zaxxer.hikari.HikariDataSource; -import gnu.trove.list.array.TIntArrayList; import java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/BuggyReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/BuggyReversePageRank.java rename to marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java index 1fd696ab..212ab2be 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/BuggyReversePageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.util.ranking; +package nu.marginalia.util.ranking; import com.zaxxer.hikari.HikariDataSource; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/BuggyStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/BuggyStandardPageRank.java rename to marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java index c2bf65b4..a7069f24 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/BuggyStandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.util.ranking; +package nu.marginalia.util.ranking; import com.zaxxer.hikari.HikariDataSource; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/RankingAlgorithm.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/RankingAlgorithm.java rename to marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java index ce63c0a6..9d01a7c0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/RankingAlgorithm.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.util.ranking; +package nu.marginalia.util.ranking; import com.zaxxer.hikari.HikariDataSource; import gnu.trove.list.TIntList; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/old/OldReversePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/old/OldReversePageRankV2.java rename to marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java index 54b88edc..17291e0e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/old/OldReversePageRankV2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.util.ranking.old; +package nu.marginalia.util.ranking.old; import com.zaxxer.hikari.HikariDataSource; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/old/StandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/old/StandardPageRank.java rename to marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java index 613a8aa2..ca3f419e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/old/StandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.util.ranking.old; +package nu.marginalia.util.ranking.old; import com.zaxxer.hikari.HikariDataSource; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/DedupTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/DedupTool.java rename to marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java index 9e0423cd..946e23ad 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/DedupTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.util.ranking.tool; +package nu.marginalia.util.ranking.tool; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/PerusePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/PerusePageRankV2.java rename to marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java index 7f525daf..bab534bf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/PerusePageRankV2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java @@ -1,10 +1,9 @@ -package nu.marginalia.wmsa.edge.index.service.util.ranking.tool; +package nu.marginalia.util.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; -import gnu.trove.map.hash.TIntDoubleHashMap; import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntObjectHashMap; import gnu.trove.set.hash.TIntHashSet; @@ -13,9 +12,9 @@ import it.unimi.dsi.fastutil.ints.IntComparator; import lombok.AllArgsConstructor; import lombok.Data; import lombok.SneakyThrows; +import nu.marginalia.util.ranking.RankingAlgorithm; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; -import nu.marginalia.wmsa.edge.index.service.util.ranking.RankingAlgorithm; import org.jetbrains.annotations.NotNull; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/TestAcademiaRankTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/TestAcademiaRankTool.java rename to marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java index 638e3f6d..eed3e492 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/TestAcademiaRankTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/TestAcademiaRankTool.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.index.service.util.ranking.tool; +package nu.marginalia.util.ranking.tool; import lombok.SneakyThrows; +import nu.marginalia.util.ranking.AcademiaRank; import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.index.service.util.ranking.AcademiaRank; import org.mariadb.jdbc.Driver; import java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/UpdateDomainRanksTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/UpdateDomainRanksTool.java rename to marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java index a78dae31..42d1aa36 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/UpdateDomainRanksTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.index.service.util.ranking.tool; +package nu.marginalia.util.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; +import nu.marginalia.util.ranking.BuggyStandardPageRank; import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/UpdateDomainRanksTool2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/UpdateDomainRanksTool2.java rename to marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java index 4ac2600d..3d119dc2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/util/ranking/tool/UpdateDomainRanksTool2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.index.service.util.ranking.tool; +package nu.marginalia.util.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; +import nu.marginalia.util.ranking.BetterReversePageRank; import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.index.service.util.ranking.BetterReversePageRank; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawler/domain/RssCrawler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawler/domain/RssCrawler.java index 44de1344..67413eb3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawler/domain/RssCrawler.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawler/domain/RssCrawler.java @@ -10,7 +10,7 @@ import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.crawler.fetcher.HttpFetcher; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; -import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank; +import nu.marginalia.util.ranking.BuggyStandardPageRank; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.jsoup.Jsoup; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java index 5865935a..5dff7ba9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java @@ -6,11 +6,10 @@ import com.google.common.hash.Hashing; import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.zaxxer.hikari.HikariDataSource; -import gnu.trove.list.array.TIntArrayList; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; -import nu.marginalia.wmsa.edge.index.service.util.ranking.BetterReversePageRank; +import nu.marginalia.util.ranking.BetterReversePageRank; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; import org.mariadb.jdbc.Driver; @@ -24,7 +23,6 @@ import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.util.*; -import java.util.stream.Stream; public class CrawlJobExtractorPageRankMain { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java index f12212ec..7c878afb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java @@ -1,12 +1,28 @@ package nu.marginalia.wmsa.edge.index; import com.google.inject.AbstractModule; +import com.google.inject.Provides; import com.google.inject.name.Names; +import nu.marginalia.wmsa.configuration.WmsaHome; +import nu.marginalia.wmsa.edge.index.model.RankingSettings; +import org.yaml.snakeyaml.Yaml; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; public class EdgeIndexModule extends AbstractModule { + + public void configure() { bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31); } + @Provides + public RankingSettings rankingSettings() { + Path dir = WmsaHome.get().resolve("conf/ranking-settings.yaml"); + return RankingSettings.from(dir); + } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/RankingSettings.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/RankingSettings.java new file mode 100644 index 00000000..b0246b2f --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/RankingSettings.java @@ -0,0 +1,28 @@ +package nu.marginalia.wmsa.edge.index.model; + +import lombok.ToString; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.yaml.snakeyaml.Yaml; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +@ToString +public class RankingSettings { + public List small; + public List retro; + public List standard; + public List academia; + + public static RankingSettings from(Path dir) { + try { + return new Yaml().loadAs(Files.readString(dir), RankingSettings.class); + } + catch (IOException ex) { + throw new RuntimeException("Failed to load " + dir, ex); + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java index 0ecf8f42..c6a5cfe4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java @@ -7,20 +7,27 @@ import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; +import nu.marginalia.util.ranking.BetterReversePageRank; +import nu.marginalia.util.ranking.BetterStandardPageRank; +import nu.marginalia.util.ranking.BuggyStandardPageRank; import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.index.service.util.ranking.*; +import nu.marginalia.wmsa.edge.index.model.RankingSettings; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @Singleton public class SearchIndexDao { private final HikariDataSource dataSource; + private RankingSettings rankingSettings; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public SearchIndexDao(HikariDataSource dataSource) + public SearchIndexDao(HikariDataSource dataSource, + RankingSettings rankingSettings) { this.dataSource = dataSource; + this.rankingSettings = rankingSettings; + logger.info("SearchIndexDao ranking settings = {}", rankingSettings); } @SneakyThrows @@ -71,14 +78,14 @@ public class SearchIndexDao { } @SneakyThrows - public TIntList getDomainsByRealPageRank() { - var spr = new BetterStandardPageRank(dataSource,"www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net", "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com"); + public TIntList getRetroDomains() { + var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new)); return spr.pageRankWithPeripheralNodes(spr.size()/2, false); } @SneakyThrows public TIntList getSmallWebDomains() { - var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); + var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new)); rpr.setMaxKnownUrls(750); @@ -87,13 +94,13 @@ public class SearchIndexDao { @SneakyThrows public TIntList getAcademiaDomains() { - var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu"); + var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), rankingSettings.academia.toArray(String[]::new)); return spr.pageRankWithPeripheralNodes(spr.size()/2, false); } @SneakyThrows - public TIntList getDomainsByStandardPageRank() { - var spr = new BuggyStandardPageRank(dataSource,"memex.marginalia.nu"); + public TIntList getStandardDomains() { + var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new)); return spr.pageRankWithPeripheralNodes(spr.size()/2, false); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java index b8c93f24..cf281116 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java @@ -23,7 +23,7 @@ public class SearchIndexPartitioner { private SearchEngineRanking retroRanking = null; private SearchEngineRanking smallWebRanking = null; - private SearchEngineRanking prWebRanking = null; + private SearchEngineRanking standardRanking = null; private SearchEngineRanking specialDomainRanking = null; private SearchEngineRanking academiaRanking = null; @@ -69,16 +69,16 @@ public class SearchIndexPartitioner { logger.info("Fetching domains"); - var retroDomains = dao.getDomainsByRealPageRank(); + var retroDomains = dao.getRetroDomains(); var smallWebDomains = dao.getSmallWebDomains(); var academiaDomains = dao.getAcademiaDomains(); - var prWebDomains = dao.getDomainsByStandardPageRank(); + var standardDomains = dao.getStandardDomains(); var specialDomains = dao.getSpecialDomains(); logger.info("Got {} retro domains", retroDomains.size()); logger.info("Got {} small domains", smallWebDomains.size()); logger.info("Got {} academia domains", academiaDomains.size()); - logger.info("Got {} corpo domains", prWebDomains.size()); + logger.info("Got {} standard domains", standardDomains.size()); logger.info("Got {} special domains", specialDomains.size()); var lock = rwl.writeLock(); @@ -87,7 +87,7 @@ public class SearchIndexPartitioner { retroRanking = new SearchEngineRanking(0, retroDomains, 0.2, 1); smallWebRanking = new SearchEngineRanking(2, smallWebDomains, 0.15); academiaRanking = new SearchEngineRanking(3, academiaDomains, 1); - prWebRanking = new SearchEngineRanking(4, prWebDomains, 0.2, 1); + standardRanking = new SearchEngineRanking(4, standardDomains, 0.2, 1); specialDomainRanking = new SearchEngineRanking(6, specialDomains, 1); logger.info("Finished building partitions table"); } @@ -112,7 +112,7 @@ public class SearchIndexPartitioner { return true; if (academiaRanking.hasBucket(bucketId, domainId)) return true; - if (prWebRanking.hasBucket(bucketId, domainId)) + if (standardRanking.hasBucket(bucketId, domainId)) return true; if (specialDomainRanking.hasBucket(bucketId, domainId)) return true; @@ -150,8 +150,8 @@ public class SearchIndexPartitioner { if (academiaRanking != null && academiaRanking.ownsBucket(bucketId)) { return academiaRanking.translateId(id); } - if (prWebRanking != null && prWebRanking.ownsBucket(bucketId)) { - return prWebRanking.translateId(id); + if (standardRanking != null && standardRanking.ownsBucket(bucketId)) { + return standardRanking.translateId(id); } if (specialDomainRanking != null && specialDomainRanking.ownsBucket(bucketId)) { return specialDomainRanking.translateId(id); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java index fafa68f1..bb946238 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java @@ -6,6 +6,7 @@ import lombok.SneakyThrows; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.index.model.RankingSettings; import nu.marginalia.wmsa.edge.index.service.SearchIndexDao; import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import org.mariadb.jdbc.Driver; @@ -58,7 +59,7 @@ public class IndexMergerMain { } var hikari = new DatabaseModule().provideConnection(); - var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari)); + var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, new RankingSettings())); var blacklist = new EdgeDomainBlacklistImpl(hikari); new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawler/domain/language/processing/SentenceExtractorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawler/domain/language/processing/SentenceExtractorTest.java index ad71e526..b6b48dfd 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawler/domain/language/processing/SentenceExtractorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawler/domain/language/processing/SentenceExtractorTest.java @@ -9,8 +9,8 @@ import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels; import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordRep; import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordSpan; import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.tag.WordSeparator; -import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyReversePageRank; -import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank; +import nu.marginalia.util.ranking.BuggyReversePageRank; +import nu.marginalia.util.ranking.BuggyStandardPageRank; import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/RankingSettingsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/RankingSettingsTest.java new file mode 100644 index 00000000..eac47334 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/RankingSettingsTest.java @@ -0,0 +1,49 @@ +package nu.marginalia.wmsa.edge.index.model; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; + +class RankingSettingsTest { + + Path tempFile; + @BeforeEach + void setUp() throws IOException { + tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp"); + } + + @AfterEach + void tearDown() throws IOException { + Files.delete(tempFile); + } + + @Test + void testParseRankingSettings() throws IOException { + Files.writeString(tempFile, """ + retro: + - "www.rep.routledge.com" + - "www.personal.kent.edu" + small: + - "bikobatanari.art" + - "wiki.xxiivv.com" + academia: + - "%edu" + standard: + - "memex.marginalia.nu" + """); + + var settings = RankingSettings.from(tempFile); + assertEquals(List.of("www.rep.routledge.com","www.personal.kent.edu"), settings.retro); + assertEquals(List.of("bikobatanari.art","wiki.xxiivv.com"), settings.small); + assertEquals(List.of("%edu"), settings.academia); + assertEquals(List.of("memex.marginalia.nu"), settings.standard); + + } +} \ No newline at end of file