From 569520c9b6cbfb36fb9201b787de6769fac31ef9 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Tue, 21 Jan 2025 15:07:43 +0100
Subject: [PATCH] (index) Add manual adjustments for rankings based on domain

---
 .../index/results/DomainRankingOverrides.java | 185 ++++++++++++++++++
 .../results/IndexResultRankingService.java    |   7 +-
 .../results/IndexResultScoreCalculator.java   |  10 +-
 .../results/DomainRankingOverridesTest.java   | 103 +++++++++++
 4 files changed, 300 insertions(+), 5 deletions(-)
 create mode 100644 code/index/java/nu/marginalia/index/results/DomainRankingOverrides.java
 create mode 100644 code/index/test/nu/marginalia/index/results/DomainRankingOverridesTest.java

diff --git a/code/index/java/nu/marginalia/index/results/DomainRankingOverrides.java b/code/index/java/nu/marginalia/index/results/DomainRankingOverrides.java
new file mode 100644
index 00000000..c06bea1a
--- /dev/null
+++ b/code/index/java/nu/marginalia/index/results/DomainRankingOverrides.java
@@ -0,0 +1,185 @@
+package nu.marginalia.index.results;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import gnu.trove.map.hash.TIntDoubleHashMap;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.db.DbDomainQueries;
+import nu.marginalia.model.EdgeDomain;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.OptionalInt;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Manually maintained per-domain multipliers for the result ranking score.
+ * <p>
+ * The factors are read from a text file.  Blank lines and lines starting with
+ * {@code #} are skipped; every other line holds two whitespace-separated tokens:
+ * <pre>
+ * value 0.75
+ * domain first.example.com
+ * domain second.example.com
+ * </pre>
+ * A {@code value} line sets the factor given to each {@code domain} line that
+ * follows it, until the next {@code value} line.  Domains that are not listed
+ * get the neutral factor 1.0.
+ */
+@Singleton
+public class DomainRankingOverrides {
+    private static final Logger logger = LoggerFactory.getLogger(DomainRankingOverrides.class);
+
+    /** Factor for domains without an override; also the fallback after an invalid value line. */
+    private static final double NEUTRAL_FACTOR = 1.;
+
+    private final DbDomainQueries domainQueries;
+    private final Path overrideFilePath;
+
+    // Never mutated after publication: a reload builds a new map and swaps the reference
+    private volatile TIntDoubleHashMap rankingFactors = newFactorMap(100);
+
+    /**
+     * Creates the production instance, which reads {@code domain-ranking-factors.txt}
+     * from the WMSA data path and re-reads it every five minutes on a background thread.
+     */
+    @Inject
+    public DomainRankingOverrides(DbDomainQueries domainQueries) {
+        this.domainQueries = domainQueries;
+        this.overrideFilePath = WmsaHome.getDataPath().resolve("domain-ranking-factors.txt");
+
+        // Daemon, so that the periodic reload never keeps the JVM from shutting down
+        Thread.ofPlatform()
+                .name("DomainRankingOverrides-reload")
+                .daemon(true)
+                .start(this::updateRunner);
+    }
+
+    /** For test access: uses the given file and starts no background thread; call {@link #reloadFile()} to load it. */
+    public DomainRankingOverrides(DbDomainQueries domainQueries, Path overrideFilePath)
+    {
+        this.domainQueries = domainQueries;
+        this.overrideFilePath = overrideFilePath;
+    }
+
+    /**
+     * Returns the ranking multiplier configured for a domain.
+     *
+     * @param domainId id of the domain
+     * @return the configured factor, or 1.0 if the domain has no override
+     */
+    public double getRankingFactor(int domainId) {
+        return rankingFactors.get(domainId);
+    }
+
+    /** Reloads the file every five minutes until the thread is interrupted. */
+    private void updateRunner() {
+        for (;;) {
+            try {
+                reloadFile();
+            } catch (RuntimeException ex) {
+                // A failed reload must not end the loop; the previous factors stay in effect
+                logger.error("Failed to reload {}", overrideFilePath, ex);
+            }
+
+            try {
+                TimeUnit.MINUTES.sleep(5);
+            } catch (InterruptedException ex) {
+                logger.warn("Thread interrupted", ex);
+                Thread.currentThread().interrupt(); // restore the flag that the catch cleared
+                break;
+            }
+        }
+    }
+
+    /**
+     * Reads the overrides file and replaces the active factors with its contents.
+     * If the file does not exist or cannot be read, the current factors are kept.
+     */
+    void reloadFile() {
+        if (!Files.exists(overrideFilePath)) {
+            return;
+        }
+
+        try {
+            List<String> lines = Files.readAllLines(overrideFilePath);
+            rankingFactors = parseOverrides(lines);
+        } catch (IOException ex) {
+            logger.error("Failed to read {}", overrideFilePath, ex);
+        }
+    }
+
+    /** Builds a new factor map from the lines of the overrides file; malformed lines are logged and skipped. */
+    private TIntDoubleHashMap parseOverrides(List<String> lines) {
+        var newRankingFactors = newFactorMap(lines.size());
+        double factor = NEUTRAL_FACTOR;
+
+        for (String rawLine : lines) {
+            // Strip first, so that indented lines and trailing whitespace do not break the two-token split
+            String line = rawLine.strip();
+            if (line.isEmpty() || line.startsWith("#")) {
+                continue;
+            }
+
+            String[] parts = line.split("\\s+");
+            if (parts.length != 2) {
+                logger.warn("Unrecognized format for domain overrides file: {}", line);
+                continue;
+            }
+
+            try {
+                switch (parts[0]) {
+                    case "value" -> factor = parseFactor(parts[1]);
+                    case "domain" -> {
+                        OptionalInt domainId = domainQueries.tryGetDomainId(new EdgeDomain(parts[1]));
+                        if (domainId.isPresent()) {
+                            newRankingFactors.put(domainId.getAsInt(), factor);
+                        }
+                        else {
+                            logger.warn("Unrecognized domain id {}", parts[1]);
+                        }
+                    }
+                    default -> logger.warn("Unrecognized format {}", line);
+                }
+            } catch (Exception ex) {
+                logger.warn("Error in parsing domain overrides file: {} ({})", line, ex.getClass().getSimpleName());
+            }
+        }
+
+        return newRankingFactors;
+    }
+
+    /**
+     * Parses the argument of a {@code value} line.  Anything that is not a finite,
+     * non-negative number is logged and replaced by the neutral factor, so that the
+     * domains below a bad value line do not inherit the factor of the previous group.
+     */
+    private static double parseFactor(String text) {
+        final double parsed;
+        try {
+            parsed = Double.parseDouble(text);
+        } catch (NumberFormatException ex) {
+            logger.error("Not a number, found {}", text);
+            return NEUTRAL_FACTOR;
+        }
+
+        if (parsed < 0) {
+            logger.error("Negative values are not permitted, found {}", parsed);
+            return NEUTRAL_FACTOR;
+        }
+        if (!Double.isFinite(parsed)) {
+            logger.error("Non-finite values are not permitted, found {}", parsed);
+            return NEUTRAL_FACTOR;
+        }
+        return parsed;
+    }
+
+    /** Creates an empty map whose lookups return the neutral factor for absent domain ids. */
+    private static TIntDoubleHashMap newFactorMap(int expectedSize) {
+        return new TIntDoubleHashMap(expectedSize, 0.75f, -1, NEUTRAL_FACTOR);
+    }
+}
diff --git 
a/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java b/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java index 9920b4da..67b3bd0b 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java @@ -40,13 +40,16 @@ public class IndexResultRankingService { private final DocumentDbReader documentDbReader; private final StatefulIndex statefulIndex; + private final DomainRankingOverrides domainRankingOverrides; @Inject public IndexResultRankingService(DocumentDbReader documentDbReader, - StatefulIndex statefulIndex) + StatefulIndex statefulIndex, + DomainRankingOverrides domainRankingOverrides) { this.documentDbReader = documentDbReader; this.statefulIndex = statefulIndex; + this.domainRankingOverrides = domainRankingOverrides; } public List rankResults(SearchParameters params, @@ -57,7 +60,7 @@ public class IndexResultRankingService { if (resultIds.isEmpty()) return List.of(); - IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, rankingContext, params); + IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext, params); List results = new ArrayList<>(resultIds.size()); diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java index 72e77a19..c6dc6775 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java @@ -41,14 +41,17 @@ public class IndexResultScoreCalculator { private final CombinedIndexReader index; private final QueryParams queryParams; + private final DomainRankingOverrides domainRankingOverrides; private final ResultRankingContext rankingContext; private final CompiledQuery 
compiledQuery; public IndexResultScoreCalculator(StatefulIndex statefulIndex, + DomainRankingOverrides domainRankingOverrides, ResultRankingContext rankingContext, SearchParameters params) { this.index = statefulIndex.get(); + this.domainRankingOverrides = domainRankingOverrides; this.rankingContext = rankingContext; this.queryParams = params.queryParams; @@ -127,10 +130,10 @@ public class IndexResultScoreCalculator { * wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(params.getBm25K(), wordFlagsQuery.data, unorderedMatches.getWeightedCounts(), rankingContext)) / (Math.sqrt(unorderedMatches.searchableKeywordCount + 1)); + double rankingAdjustment = domainRankingOverrides.getRankingFactor(UrlIdCodec.getDomainId(combinedId)); + double score = normalize( - score_firstPosition + score_proximity + score_verbatim - + score_bM25 - + score_bFlags, + rankingAdjustment * (score_firstPosition + score_proximity + score_verbatim + score_bM25 + score_bFlags), -Math.min(0, documentBonus) // The magnitude of documentBonus, if it is negative; otherwise 0 ); @@ -580,3 +583,4 @@ public class IndexResultScoreCalculator { } } + diff --git a/code/index/test/nu/marginalia/index/results/DomainRankingOverridesTest.java b/code/index/test/nu/marginalia/index/results/DomainRankingOverridesTest.java new file mode 100644 index 00000000..908fb705 --- /dev/null +++ b/code/index/test/nu/marginalia/index/results/DomainRankingOverridesTest.java @@ -0,0 +1,103 @@ +package nu.marginalia.index.results; + +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.db.DbDomainQueries; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.test.TestMigrationLoader; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; +import org.junit.jupiter.api.parallel.ExecutionMode; +import 
org.testcontainers.containers.MariaDBContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.sql.SQLException;
+
+/**
+ * Integration test for {@link DomainRankingOverrides}: an overrides file is parsed
+ * against domain ids stored in a MariaDB test container.
+ */
+@Testcontainers
+@Execution(ExecutionMode.SAME_THREAD)
+@Tag("slow")
+class DomainRankingOverridesTest {
+    @Container
+    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
+            .withDatabaseName("WMSA_prod")
+            .withUsername("wmsa")
+            .withPassword("wmsa")
+            .withNetworkAliases("mariadb");
+
+    private static DbDomainQueries domainQueries;
+
+    /** Runs the test migrations and seeds EC_DOMAIN with the four domains the test looks up. */
+    @BeforeAll
+    public static void setup() throws SQLException {
+        HikariConfig config = new HikariConfig();
+        config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
+        config.setUsername("wmsa");
+        config.setPassword("wmsa");
+
+        var dataSource = new HikariDataSource(config);
+
+        TestMigrationLoader.flywayMigration(dataSource);
+
+        try (var conn = dataSource.getConnection();
+             var stmt = conn.createStatement()) {
+            // DML goes through executeUpdate(); executeQuery() is meant for statements that return a ResultSet
+            stmt.executeUpdate("DELETE FROM EC_DOMAIN"); // Wipe any old state from other test runs
+
+            stmt.executeUpdate("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('first.example.com', 'example.com', 1)");
+            stmt.executeUpdate("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('second.example.com', 'example.com', 1)");
+            stmt.executeUpdate("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('third.example.com', 'example.com', 1)");
+            stmt.executeUpdate("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('not-added.example.com', 'example.com', 1)");
+        }
+
+        domainQueries = new DbDomainQueries(dataSource);
+    }
+
+    /** Listed domains get the factor of the preceding value line; every other id gets 1.0. */
+    @Test
+    public void test() throws IOException {
+        Path overridesFile = Files.createTempFile(getClass().getSimpleName(), ".txt");
+        try {
+            Files.writeString(overridesFile, """
+                    # A comment
+                    value 0.75
+                    domain first.example.com
+                    domain second.example.com
+
+                    value 1.1
+                    domain third.example.com
+                    """,
+                    StandardOpenOption.APPEND);
+
+            // The two-argument constructor starts no reload thread, so the file is loaded explicitly
+            var overrides = new DomainRankingOverrides(domainQueries, overridesFile);
+            overrides.reloadFile();
+
+            Assertions.assertEquals(0.75, overrides.getRankingFactor(domainId("first.example.com")));
+            Assertions.assertEquals(0.75, overrides.getRankingFactor(domainId("second.example.com")));
+            Assertions.assertEquals(1.1, overrides.getRankingFactor(domainId("third.example.com")));
+            // Present in the database, but absent from the overrides file
+            Assertions.assertEquals(1.0, overrides.getRankingFactor(domainId("not-added.example.com")));
+            Assertions.assertEquals(1.0, overrides.getRankingFactor(1<<23)); // an id that appears nowhere in the file
+        }
+        finally {
+            Files.deleteIfExists(overridesFile);
+        }
+    }
+
+    /** Looks up the database id of a domain name through {@code DbDomainQueries.getDomainId}. */
+    private static int domainId(String domainName) {
+        return domainQueries.getDomainId(new EdgeDomain(domainName));
+    }
+}
\ No newline at end of file