mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(index) Add manual adjustments for rankings based on domain
This commit is contained in:
parent
088310e998
commit
569520c9b6
@ -0,0 +1,119 @@
|
|||||||
|
package nu.marginalia.index.results;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import gnu.trove.map.hash.TIntDoubleHashMap;
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.db.DbDomainQueries;
|
||||||
|
import nu.marginalia.model.EdgeDomain;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.OptionalInt;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class DomainRankingOverrides {
|
||||||
|
private final DbDomainQueries domainQueries;
|
||||||
|
|
||||||
|
private volatile TIntDoubleHashMap rankingFactors = new TIntDoubleHashMap(100, 0.75f, -1, 1.);
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(DomainRankingOverrides.class);
|
||||||
|
|
||||||
|
private final Path overrideFilePath;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public DomainRankingOverrides(DbDomainQueries domainQueries) {
|
||||||
|
this.domainQueries = domainQueries;
|
||||||
|
|
||||||
|
overrideFilePath = WmsaHome.getDataPath().resolve("domain-ranking-factors.txt");
|
||||||
|
|
||||||
|
Thread.ofPlatform().start(this::updateRunner);
|
||||||
|
}
|
||||||
|
|
||||||
|
// for test access
|
||||||
|
public DomainRankingOverrides(DbDomainQueries domainQueries, Path overrideFilePath)
|
||||||
|
{
|
||||||
|
this.domainQueries = domainQueries;
|
||||||
|
this.overrideFilePath = overrideFilePath;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getRankingFactor(int domainId) {
|
||||||
|
return rankingFactors.get(domainId);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateRunner() {
|
||||||
|
for (;;) {
|
||||||
|
reloadFile();
|
||||||
|
|
||||||
|
try {
|
||||||
|
TimeUnit.MINUTES.sleep(5);
|
||||||
|
} catch (InterruptedException ex) {
|
||||||
|
logger.warn("Thread interrupted", ex);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void reloadFile() {
|
||||||
|
if (!Files.exists(overrideFilePath)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
List<String> lines = Files.readAllLines(overrideFilePath);
|
||||||
|
|
||||||
|
double factor = 1.;
|
||||||
|
|
||||||
|
var newRankingFactors = new TIntDoubleHashMap(lines.size(), 0.75f, -1, 1.);
|
||||||
|
|
||||||
|
for (var line : lines) {
|
||||||
|
if (line.isBlank()) continue;
|
||||||
|
if (line.startsWith("#")) continue;
|
||||||
|
|
||||||
|
String[] parts = line.split("\\s+");
|
||||||
|
if (parts.length != 2) {
|
||||||
|
logger.warn("Unrecognized format for domain overrides file: {}", line);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
switch (parts[0]) {
|
||||||
|
case "value" -> {
|
||||||
|
// error handle me
|
||||||
|
factor = Double.parseDouble(parts[1]);
|
||||||
|
if (factor < 0) {
|
||||||
|
logger.error("Negative values are not permitted, found {}", factor);
|
||||||
|
factor = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case "domain" -> {
|
||||||
|
// error handle
|
||||||
|
OptionalInt domainId = domainQueries.tryGetDomainId(new EdgeDomain(parts[1]));
|
||||||
|
if (domainId.isPresent()) {
|
||||||
|
newRankingFactors.put(domainId.getAsInt(), factor);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
logger.warn("Unrecognized domain id {}", parts[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
default -> {
|
||||||
|
logger.warn("Unrecognized format {}", line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception ex) {
|
||||||
|
logger.warn("Error in parsing domain overrides file: {} ({})", line, ex.getClass().getSimpleName());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
rankingFactors = newRankingFactors;
|
||||||
|
} catch (IOException ex) {
|
||||||
|
logger.error("Failed to read " + overrideFilePath, ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -40,13 +40,16 @@ public class IndexResultRankingService {
|
|||||||
|
|
||||||
private final DocumentDbReader documentDbReader;
|
private final DocumentDbReader documentDbReader;
|
||||||
private final StatefulIndex statefulIndex;
|
private final StatefulIndex statefulIndex;
|
||||||
|
private final DomainRankingOverrides domainRankingOverrides;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public IndexResultRankingService(DocumentDbReader documentDbReader,
|
public IndexResultRankingService(DocumentDbReader documentDbReader,
|
||||||
StatefulIndex statefulIndex)
|
StatefulIndex statefulIndex,
|
||||||
|
DomainRankingOverrides domainRankingOverrides)
|
||||||
{
|
{
|
||||||
this.documentDbReader = documentDbReader;
|
this.documentDbReader = documentDbReader;
|
||||||
this.statefulIndex = statefulIndex;
|
this.statefulIndex = statefulIndex;
|
||||||
|
this.domainRankingOverrides = domainRankingOverrides;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<SearchResultItem> rankResults(SearchParameters params,
|
public List<SearchResultItem> rankResults(SearchParameters params,
|
||||||
@ -57,7 +60,7 @@ public class IndexResultRankingService {
|
|||||||
if (resultIds.isEmpty())
|
if (resultIds.isEmpty())
|
||||||
return List.of();
|
return List.of();
|
||||||
|
|
||||||
IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, rankingContext, params);
|
IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext, params);
|
||||||
|
|
||||||
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
|
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
|
||||||
|
|
||||||
|
@ -41,14 +41,17 @@ public class IndexResultScoreCalculator {
|
|||||||
private final CombinedIndexReader index;
|
private final CombinedIndexReader index;
|
||||||
private final QueryParams queryParams;
|
private final QueryParams queryParams;
|
||||||
|
|
||||||
|
private final DomainRankingOverrides domainRankingOverrides;
|
||||||
private final ResultRankingContext rankingContext;
|
private final ResultRankingContext rankingContext;
|
||||||
private final CompiledQuery<String> compiledQuery;
|
private final CompiledQuery<String> compiledQuery;
|
||||||
|
|
||||||
public IndexResultScoreCalculator(StatefulIndex statefulIndex,
|
public IndexResultScoreCalculator(StatefulIndex statefulIndex,
|
||||||
|
DomainRankingOverrides domainRankingOverrides,
|
||||||
ResultRankingContext rankingContext,
|
ResultRankingContext rankingContext,
|
||||||
SearchParameters params)
|
SearchParameters params)
|
||||||
{
|
{
|
||||||
this.index = statefulIndex.get();
|
this.index = statefulIndex.get();
|
||||||
|
this.domainRankingOverrides = domainRankingOverrides;
|
||||||
this.rankingContext = rankingContext;
|
this.rankingContext = rankingContext;
|
||||||
|
|
||||||
this.queryParams = params.queryParams;
|
this.queryParams = params.queryParams;
|
||||||
@ -127,10 +130,10 @@ public class IndexResultScoreCalculator {
|
|||||||
* wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(params.getBm25K(), wordFlagsQuery.data, unorderedMatches.getWeightedCounts(), rankingContext))
|
* wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(params.getBm25K(), wordFlagsQuery.data, unorderedMatches.getWeightedCounts(), rankingContext))
|
||||||
/ (Math.sqrt(unorderedMatches.searchableKeywordCount + 1));
|
/ (Math.sqrt(unorderedMatches.searchableKeywordCount + 1));
|
||||||
|
|
||||||
|
double rankingAdjustment = domainRankingOverrides.getRankingFactor(UrlIdCodec.getDomainId(combinedId));
|
||||||
|
|
||||||
double score = normalize(
|
double score = normalize(
|
||||||
score_firstPosition + score_proximity + score_verbatim
|
rankingAdjustment * (score_firstPosition + score_proximity + score_verbatim + score_bM25 + score_bFlags),
|
||||||
+ score_bM25
|
|
||||||
+ score_bFlags,
|
|
||||||
-Math.min(0, documentBonus) // The magnitude of documentBonus, if it is negative; otherwise 0
|
-Math.min(0, documentBonus) // The magnitude of documentBonus, if it is negative; otherwise 0
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -580,3 +583,4 @@ public class IndexResultScoreCalculator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -0,0 +1,103 @@
|
|||||||
|
package nu.marginalia.index.results;
|
||||||
|
|
||||||
|
import com.zaxxer.hikari.HikariConfig;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.db.DbDomainQueries;
|
||||||
|
import nu.marginalia.model.EdgeDomain;
|
||||||
|
import nu.marginalia.test.TestMigrationLoader;
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.parallel.Execution;
|
||||||
|
import org.junit.jupiter.api.parallel.ExecutionMode;
|
||||||
|
import org.testcontainers.containers.MariaDBContainer;
|
||||||
|
import org.testcontainers.junit.jupiter.Container;
|
||||||
|
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
|
||||||
|
@Testcontainers
|
||||||
|
@Execution(ExecutionMode.SAME_THREAD)
|
||||||
|
@Tag("slow")
|
||||||
|
class DomainRankingOverridesTest {
|
||||||
|
@Container
|
||||||
|
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||||
|
.withDatabaseName("WMSA_prod")
|
||||||
|
.withUsername("wmsa")
|
||||||
|
.withPassword("wmsa")
|
||||||
|
.withNetworkAliases("mariadb");
|
||||||
|
|
||||||
|
private static DbDomainQueries domainQueries;
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
public static void setup() throws SQLException {
|
||||||
|
HikariConfig config = new HikariConfig();
|
||||||
|
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
|
||||||
|
config.setUsername("wmsa");
|
||||||
|
config.setPassword("wmsa");
|
||||||
|
|
||||||
|
var dataSource = new HikariDataSource(config);
|
||||||
|
|
||||||
|
TestMigrationLoader.flywayMigration(dataSource);
|
||||||
|
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.createStatement()) {
|
||||||
|
stmt.executeQuery("DELETE FROM EC_DOMAIN"); // Wipe any old state from other test runs
|
||||||
|
|
||||||
|
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('first.example.com', 'example.com', 1)");
|
||||||
|
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('second.example.com', 'example.com', 1)");
|
||||||
|
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('third.example.com', 'example.com', 1)");
|
||||||
|
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('not-added.example.com', 'example.com', 1)");
|
||||||
|
}
|
||||||
|
|
||||||
|
domainQueries = new DbDomainQueries(dataSource);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test() throws IOException {
|
||||||
|
|
||||||
|
Path overridesFile = Files.createTempFile(getClass().getSimpleName(), ".txt");
|
||||||
|
try {
|
||||||
|
|
||||||
|
Files.writeString(overridesFile, """
|
||||||
|
# A comment
|
||||||
|
value 0.75
|
||||||
|
domain first.example.com
|
||||||
|
domain second.example.com
|
||||||
|
|
||||||
|
value 1.1
|
||||||
|
domain third.example.com
|
||||||
|
""",
|
||||||
|
StandardOpenOption.APPEND);
|
||||||
|
|
||||||
|
var overrides = new DomainRankingOverrides(domainQueries, overridesFile);
|
||||||
|
|
||||||
|
overrides.reloadFile();
|
||||||
|
|
||||||
|
Assertions.assertEquals(0.75, overrides.getRankingFactor(
|
||||||
|
domainQueries.getDomainId(new EdgeDomain("first.example.com"))
|
||||||
|
));
|
||||||
|
Assertions.assertEquals(0.75, overrides.getRankingFactor(
|
||||||
|
domainQueries.getDomainId(new EdgeDomain("second.example.com"))
|
||||||
|
));
|
||||||
|
Assertions.assertEquals(1.1, overrides.getRankingFactor(
|
||||||
|
domainQueries.getDomainId(new EdgeDomain("third.example.com"))
|
||||||
|
));
|
||||||
|
Assertions.assertEquals(1.0, overrides.getRankingFactor(
|
||||||
|
domainQueries.getDomainId(new EdgeDomain("not-added.example.com"))
|
||||||
|
));
|
||||||
|
Assertions.assertEquals(1.0, overrides.getRankingFactor(1<<23));
|
||||||
|
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
Files.deleteIfExists(overridesFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user