Add parameters to the ranking and search set configurations.

This commit is contained in:
Viktor Lofgren 2023-02-13 17:07:33 +01:00
parent b92d18521d
commit d6b02f6669
5 changed files with 54 additions and 26 deletions

View File

@ -2,9 +2,8 @@ package nu.marginalia.wmsa.edge.index;
import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import com.google.inject.name.Names;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import nu.marginalia.wmsa.edge.index.config.RankingSettings;
import java.nio.file.Path;

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.model;
package nu.marginalia.wmsa.edge.index.config;
import lombok.ToString;
import org.yaml.snakeyaml.Yaml;
@ -10,10 +10,11 @@ import java.util.List;
@ToString
public class RankingSettings {
public List<String> small;
public List<String> retro;
public List<String> standard;
public List<String> academia;
public RankingSettingsEntry small;
public RankingSettingsEntry retro;
public RankingSettingsEntry standard;
public RankingSettingsEntry academia;
public RankingSettingsEntry ranking;
public static RankingSettings from(Path dir) {
try {

View File

@ -0,0 +1,8 @@
package nu.marginalia.wmsa.edge.index.config;
import java.util.List;
/**
 * Settings for a single ranking or search set: a list of domains plus an
 * integer parameter for the ranking run.
 *
 * <p>The field names match the {@code domains} and {@code max} keys used in
 * the ranking settings YAML. NOTE(review): presumably populated reflectively
 * by the YAML loader, which would be why the fields are public and non-final
 * -- confirm against {@code RankingSettings.from}.
 */
public class RankingSettingsEntry {
/**
 * Domain entries for this set. Callers convert this list to a
 * {@code String[]} and pass it to the page rank constructors, and they
 * dereference it without a null check.
 */
public List<String> domains;
/**
 * Passed as the first argument to {@code pageRankWithPeripheralNodes}.
 * NOTE(review): presumably an upper bound on the number of ranked results
 * -- confirm. Remains 0 (the Java default) unless assigned.
 */
public int max;
}

View File

@ -9,7 +9,7 @@ import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAcc
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import nu.marginalia.wmsa.edge.index.config.RankingSettings;
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet;
@ -86,9 +86,11 @@ public class EdgeIndexSearchSetsService {
}
private void updateDomainRankings() {
var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new));
var entry = rankingSettings.academia;
var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
var ranks = spr.pageRankWithPeripheralNodes(entry.max, () -> new RankingResultHashMapAccumulator(100_000));
var ranks = spr.pageRankWithPeripheralNodes(Math.min(100_000, spr.size() / 2), () -> new RankingResultHashMapAccumulator(100_000));
synchronized (this) {
domainRankings = new DomainRankings(ranks);
}
@ -96,8 +98,10 @@ public class EdgeIndexSearchSetsService {
@SneakyThrows
public void updateRetroDomainsSet() {
var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new));
var data = spr.pageRankWithPeripheralNodes(Math.min(50_000, spr.size()), RankingResultBitSetAccumulator::new);
var entry = rankingSettings.retro;
var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultBitSetAccumulator::new);
synchronized (this) {
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
@ -107,9 +111,11 @@ public class EdgeIndexSearchSetsService {
@SneakyThrows
public void updateSmallWebDomainsSet() {
var rpr = new ReversePageRank(similarityDomains, rankingSettings.small.toArray(String[]::new));
var entry = rankingSettings.small;
var rpr = new ReversePageRank(similarityDomains, entry.domains.toArray(String[]::new));
rpr.setMaxKnownUrls(750);
var data = rpr.pageRankWithPeripheralNodes(Math.min(10_000, rpr.size()), RankingResultBitSetAccumulator::new);
var data = rpr.pageRankWithPeripheralNodes(entry.max, RankingResultBitSetAccumulator::new);
synchronized (this) {
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
@ -119,8 +125,10 @@ public class EdgeIndexSearchSetsService {
@SneakyThrows
public void updateAcademiaDomainsSet() {
var spr = new StandardPageRank(similarityDomains, rankingSettings.academia.toArray(String[]::new));
var data = spr.pageRankWithPeripheralNodes(Math.min(15_000, spr.size()/2), RankingResultBitSetAccumulator::new);
var entry = rankingSettings.academia;
var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultBitSetAccumulator::new);
synchronized (this) {
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);

View File

@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.index.model;
import nu.marginalia.wmsa.edge.index.config.RankingSettings;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -28,22 +29,33 @@ class RankingSettingsTest {
/**
 * Writes a YAML document with four sections (retro, small, academia,
 * standard), each carrying a {@code max} value and a {@code domains} list,
 * loads it through {@code RankingSettings.from}, and checks the parsed values.
 *
 * @throws IOException if the temporary file cannot be written
 */
void testParseRankingSettings() throws IOException {
Files.writeString(tempFile, """
retro:
max: 50
domains:
- "www.rep.routledge.com"
- "www.personal.kent.edu"
small:
max: 10
domains:
- "bikobatanari.art"
- "wiki.xxiivv.com"
academia:
max: 101
domains:
- "%edu"
standard:
max: 23
domains:
- "memex.marginalia.nu"
""");
var settings = RankingSettings.from(tempFile);
// NOTE(review): the next four assertions compare against the settings field
// itself rather than its .domains list; they look like the pre-change form
// of the assertions further down -- confirm whether they should remain.
assertEquals(List.of("www.rep.routledge.com","www.personal.kent.edu"), settings.retro);
assertEquals(List.of("bikobatanari.art","wiki.xxiivv.com"), settings.small);
assertEquals(List.of("%edu"), settings.academia);
assertEquals(List.of("memex.marginalia.nu"), settings.standard);
assertEquals(List.of("www.rep.routledge.com","www.personal.kent.edu"), settings.retro.domains);
assertEquals(50, settings.retro.max);
assertEquals(List.of("bikobatanari.art","wiki.xxiivv.com"), settings.small.domains);
assertEquals(10, settings.small.max);
// NOTE(review): repeats the small.domains assertion above; the max values
// written for academia (101) and standard (23) are not asserted anywhere in
// this method.
assertEquals(List.of("bikobatanari.art","wiki.xxiivv.com"), settings.small.domains);
assertEquals(List.of("%edu"), settings.academia.domains);
assertEquals(List.of("memex.marginalia.nu"), settings.standard.domains);
}
}