Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00
Reintroduce the ability to filter search results by their ranking.
This commit is contained in:
parent 4a07eda61c
commit 04f905f3a1
nu/marginalia/util/ranking/BuggyReversePageRank.java (deleted)
@@ -1,39 +0,0 @@
-package nu.marginalia.util.ranking;
-
-public class BuggyReversePageRank extends RankingAlgorithm {
-
-    public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) {
-        super(domains, origins);
-    }
-
-    @Override
-    RankVector createNewRankVector(RankVector rank) {
-
-        double rankNorm = rank.norm();
-        RankVector newRank = new RankVector(0);
-
-        for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
-
-            var links = linkDataSrc2Dest[domainId];
-
-            if (links != null && links.size() > 0) {
-                double newRankValue = 0;
-
-                for (int j = 0; j < links.size(); j++) {
-                    newRankValue += rank.get(links.getQuick(j)) / links.size();
-                }
-
-                newRank.set(domainId, 0.85*newRankValue/rankNorm);
-            }
-        }
-        return newRank;
-    }
-
-    @Override
-    void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
-        originDomainIds.forEach(id -> vector.increment(domainIdToIndex.get(id), dNorm/oldNorm));
-    }
-
-}
nu/marginalia/util/ranking/BuggyStandardPageRank.java (deleted)
@@ -1,45 +0,0 @@
-package nu.marginalia.util.ranking;
-
-public class BuggyStandardPageRank extends RankingAlgorithm {
-
-    public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) {
-        super(domains, origins);
-    }
-
-    @Override
-    RankingAlgorithm.RankVector createNewRankVector(RankingAlgorithm.RankVector rank) {
-        RankVector newRank = new RankVector(0);
-
-        for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
-
-            var links = linkDataSrc2Dest[domainId];
-            double newRankValue = 0;
-
-            if (links != null && links.size() > 0) {
-                for (int j = 0; j < links.size(); j++) {
-                    int linkedDomain = links.getQuick(j);
-
-                    int linkSize = 1;
-                    var bl = linkDataSrc2Dest[linkedDomain];
-                    if (bl != null) {
-                        linkSize = bl.size();
-                    }
-
-                    newRankValue += rank.get(linkedDomain) / linkSize;
-
-                }
-            }
-
-            newRank.set(domainId, 0.85 * newRankValue);
-        }
-        return newRank;
-    }
-
-    @Override
-    void adjustRankVector(RankingAlgorithm.RankVector vector, double dNorm, double oldNorm) {
-        originDomainIds.forEach(id -> vector.increment(id, dNorm/originDomainIds.size()));
-        vector.incrementAll(0.14*dNorm/vector.size());
-    }
-
-}
nu/marginalia/util/ranking/tool/DedupTool.java (deleted)
@@ -1,89 +0,0 @@
-package nu.marginalia.util.ranking.tool;
-
-import lombok.AllArgsConstructor;
-import lombok.Getter;
-import lombok.SneakyThrows;
-import lombok.ToString;
-import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import org.mariadb.jdbc.Driver;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.util.*;
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
-
-public class DedupTool {
-
-    private static final Logger logger = LoggerFactory.getLogger(DedupTool.class);
-
-    public Set<String> originDomains = new HashSet<>();
-    public Set<Integer> originDomainIds = new HashSet<>();
-    public final long domainIdMax = -1;
-    public int domainCount;
-    private volatile static int rankMax;
-
-    public int maxId() {
-        return (int) domainIdMax;
-    }
-    public int domainCount() {
-        return domainCount;
-    }
-
-    static LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
-    volatile static boolean running = true;
-
-    @AllArgsConstructor @ToString @Getter
-    static class Data {
-        String url;
-        int id;
-        String domain;
-    }
-
-    @SneakyThrows
-    public static void main(String... args) {
-        Driver driver = new Driver();
-        var ds = new DatabaseModule().provideConnection();
-
-        Map<Integer, Map<Integer, List<Data>>> domainToHashToUrl = new HashMap<>();
-
-        try (var conn = ds.getConnection();
-             var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
-             var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?")
-        ) {
-            fetchStmt.setFetchSize(10_000);
-            var rsp = fetchStmt.executeQuery();
-            while (rsp.next()) {
-                domainToHashToUrl.computeIfAbsent(rsp.getInt(1), i -> new HashMap<>())
-                        .computeIfAbsent(rsp.getInt(2), i -> new ArrayList<>()).add(new Data(rsp.getString(3), rsp.getInt(4), rsp.getString(5)));
-            }
-
-            List<Integer> updateIds = new ArrayList<>();
-
-            domainToHashToUrl.forEach((domain, hashes) -> {
-                hashes.forEach((hash, urls) -> {
-                    if (urls.size() > 1) {
-                        Comparator<Data> c = Comparator.comparing(d -> d.domain.length());
-                        var urls2 = urls.stream().sorted(c.thenComparing(d -> d.url.length()))
-                                .collect(Collectors.partitioningBy(d -> d.url.endsWith("/")));
-
-                        Stream
-                                .concat(urls2.get(true).stream(), urls2.get(false).stream()).skip(1)
-                                .map(Data::getId)
-                                .forEach(updateIds::add);
-                    }
-                });
-            });
-
-            for (int id : updateIds) {
-                updateStmt.setInt(1, id);
-                updateStmt.executeUpdate();
-            }
-        }
-    }
-
-}
nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java (deleted)
@@ -1,93 +0,0 @@
-package nu.marginalia.util.ranking.tool;
-
-import com.zaxxer.hikari.HikariDataSource;
-import lombok.SneakyThrows;
-import nu.marginalia.util.ranking.BuggyStandardPageRank;
-import nu.marginalia.util.ranking.RankingDomainFetcher;
-import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
-import org.mariadb.jdbc.Driver;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.sql.SQLException;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.concurrent.LinkedBlockingQueue;
-
-public class UpdateDomainRanksTool {
-
-    private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class);
-
-    public Set<String> originDomains = new HashSet<>();
-    public Set<Integer> originDomainIds = new HashSet<>();
-    public final long domainIdMax = -1;
-    public int domainCount;
-    private volatile static int rankMax;
-
-    public int maxId() {
-        return (int) domainIdMax;
-    }
-    public int domainCount() {
-        return domainCount;
-    }
-
-    static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
-    volatile static boolean running = true;
-
-    @SneakyThrows
-    public static void main(String... args) {
-        org.mariadb.jdbc.Driver driver = new Driver();
-        var conn = new DatabaseModule().provideConnection();
-
-        long start = System.currentTimeMillis();
-        var uploader = new Thread(() -> uploadThread(conn), "Uploader");
-
-        logger.info("Ranking");
-        var ds = new DatabaseModule().provideConnection();
-        var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
-        var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
-
-        rankMax = spr.size()*2;
-        uploader.start();
-
-        var rankData = spr.pageRankWithPeripheralNodes(rankMax);
-        for (int i : rankData) {
-            try {
-                uploadQueue.put(i);
-            } catch (InterruptedException e) {
-                e.printStackTrace();
-            }
-        }
-
-        long end = System.currentTimeMillis();
-        running = false;
-        uploader.join();
-
-        logger.info("Done in {}", (end - start)/1000.0);
-    }
-
-    public static void uploadThread(HikariDataSource dataSource) {
-        int i = 0;
-
-        try (var conn = dataSource.getConnection()) {
-            logger.info("Resetting rank");
-            try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=1")) {
-                stmt.executeUpdate();
-            }
-
-            logger.info("Updating ranks");
-            try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=? WHERE ID=?")) {
-                while (running || (!running && !uploadQueue.isEmpty())) {
-                    var job = uploadQueue.take();
-                    stmt.setDouble(1, i++ / (double) rankMax);
-                    stmt.setInt(2, job);
-                    stmt.executeUpdate();
-                }
-            }
-
-        } catch (SQLException | InterruptedException throwables) {
-            throwables.printStackTrace();
-        }
-    }
-}
nu/marginalia/wmsa/edge/index/EdgeIndexControl.java
@@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.index;
 
 import com.google.inject.Inject;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
 
 import java.io.IOException;
 
@@ -9,14 +10,16 @@ import java.io.IOException;
 public class EdgeIndexControl {
 
     private final IndexServicesFactory servicesFactory;
+    private final EdgeIndexSearchSetsService searchSetsService;
 
     @Inject
-    public EdgeIndexControl(IndexServicesFactory servicesFactory) {
+    public EdgeIndexControl(IndexServicesFactory servicesFactory, EdgeIndexSearchSetsService searchSetsService) {
         this.servicesFactory = servicesFactory;
+        this.searchSetsService = searchSetsService;
     }
 
     public void regenerateIndex() throws IOException {
-        servicesFactory.convertIndex();
+        servicesFactory.convertIndex(searchSetsService.getDomainRankings());
 
         System.gc();
     }
 
nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
@@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist;
 import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
 import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
 import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
+import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
 import nu.marginalia.wmsa.edge.index.postings.SearchIndex;
 import nu.marginalia.wmsa.edge.index.postings.SearchIndexReader;
 import nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexConverter;
@@ -20,6 +21,7 @@ import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexConverter;
 import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPrioReader;
 import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPriorityParameters;
 import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexReader;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -33,7 +35,6 @@ import java.util.concurrent.Callable;
 @Singleton
 public class IndexServicesFactory {
     private final Path tmpFileDir;
-    private final EdgeDomainBlacklist domainBlacklist;
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
@@ -58,12 +59,10 @@ public class IndexServicesFactory {
     public IndexServicesFactory(
             @Named("tmp-file-dir") Path tmpFileDir,
             @Named("partition-root-slow") Path partitionRootSlow,
-            @Named("partition-root-fast") Path partitionRootFast,
-            EdgeDomainBlacklist domainBlacklist
+            @Named("partition-root-fast") Path partitionRootFast
             ) throws IOException {
 
         this.tmpFileDir = tmpFileDir;
-        this.domainBlacklist = domainBlacklist;
 
         this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat");
         this.keywordLexiconFile = new RootDataFile(partitionRootSlow, "dictionary.dat");
@@ -106,8 +105,8 @@ public class IndexServicesFactory {
 
     }
 
-    public void convertIndex() throws IOException {
-        convertForwardIndex();
+    public void convertIndex(DomainRankings domainRankings) throws IOException {
+        convertForwardIndex(domainRankings);
         convertFullReverseIndex();
         convertPriorityReverseIndex();
 
@@ -148,13 +147,14 @@ public class IndexServicesFactory {
         tryGc();
     }
 
-    private void convertForwardIndex() throws IOException {
+    private void convertForwardIndex(DomainRankings domainRankings) throws IOException {
         logger.info("Converting forward index data");
 
-        new ForwardIndexConverter(tmpFileDir,
+        new ForwardIndexConverter(
                 writerIndexFile.get(0),
                 fwdIndexDocId.get(NEXT_PART).toPath(),
-                fwdIndexDocData.get(NEXT_PART).toPath())
+                fwdIndexDocData.get(NEXT_PART).toPath(),
+                domainRankings)
                 .convert();
 
         tryGc();
@@ -212,8 +212,8 @@ public class IndexServicesFactory {
         }
     }
 
-    public SearchIndex createIndexBucket() {
-        return new SearchIndex(this, new EdgeIndexControl(this));
+    public SearchIndex createIndexBucket(EdgeIndexSearchSetsService searchSetsService) {
+        return new SearchIndex(this, new EdgeIndexControl(this, searchSetsService));
    }
 
     public SearchIndexReader getSearchIndexReader() throws IOException {
nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java
@@ -8,7 +8,8 @@ import java.util.Set;
 import static java.lang.Math.max;
 import static java.lang.Math.min;
 
-public record EdgePageDocumentsMetadata(int encSize,
+public record EdgePageDocumentsMetadata(int rank,
+                                        int encSize,
                                         int topology,
                                         int year,
                                         int sets,
@@ -16,9 +17,13 @@ public record EdgePageDocumentsMetadata(int encSize,
                                         byte flags) {
 
+    public static final long RANK_MASK = 0xFFL;
+    public static final int RANK_SHIFT = 48;
+
     public static final long ENCSIZE_MASK = 0xFFL;
-    public static final int ENCSIZE_SHIFT = 48;
+    public static final int ENCSIZE_SHIFT = 40;
     public static final int ENCSIZE_MULTIPLIER = 50;
 
     public static final long TOPOLOGY_MASK = 0xFFL;
 
     public static final int TOPOLOGY_SHIFT = 32;
@@ -39,7 +44,7 @@ public record EdgePageDocumentsMetadata(int encSize,
         this(defaultValue());
     }
     public EdgePageDocumentsMetadata(int topology, int year, int sets, int quality, EnumSet<EdgePageDocumentFlags> flags) {
-        this(0, topology, year, sets, quality, encodeFlags(flags));
+        this(0, 0, topology, year, sets, quality, encodeFlags(flags));
     }
 
     public EdgePageDocumentsMetadata withSize(int size) {
@@ -49,7 +54,7 @@ public record EdgePageDocumentsMetadata(int encSize,
 
         final int encSize = (int) Math.min(ENCSIZE_MASK, Math.max(1, size / ENCSIZE_MULTIPLIER));
 
-        return new EdgePageDocumentsMetadata(encSize, topology, year, sets, quality, flags);
+        return new EdgePageDocumentsMetadata(rank, encSize, topology, year, sets, quality, flags);
     }
 
     private static byte encodeFlags(Set<EdgePageDocumentFlags> flags) {
@@ -63,7 +68,8 @@ public record EdgePageDocumentsMetadata(int encSize,
     }
 
     public EdgePageDocumentsMetadata(long value) {
-        this( (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK),
+        this( (int) ((value >>> RANK_SHIFT) & RANK_MASK),
+              (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK),
              (int) ((value >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK),
              (int) ((value >>> YEAR_SHIFT) & YEAR_MASK),
              (int) ((value >>> SETS_SHIFT) & SETS_MASK),
@@ -84,12 +90,13 @@ public record EdgePageDocumentsMetadata(int encSize,
         ret |= min(YEAR_MASK, max(0, year)) << YEAR_SHIFT;
         ret |= min(TOPOLOGY_MASK, max(0, topology)) << TOPOLOGY_SHIFT;
         ret |= min(ENCSIZE_MASK, max(0, encSize)) << ENCSIZE_SHIFT;
+        ret |= min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
 
         return ret;
     }
 
     public boolean isEmpty() {
-        return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0;
+        return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0 && rank == 0;
     }
 
     public static int decodeQuality(long encoded) {
@@ -112,6 +119,12 @@ public record EdgePageDocumentsMetadata(int encSize,
         return ENCSIZE_MULTIPLIER * (int) ((encoded >>> ENCSIZE_SHIFT) & ENCSIZE_MASK);
     }
 
+    public static int decodeRank(long encoded) {
+        return (int) ((encoded >>> RANK_SHIFT) & RANK_MASK);
+    }
+
+    public static long encodeRank(long encoded, int rank) {
+        return encoded | min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
+    }
+
 }
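The new layout gives the rank the top used byte of the 64-bit metadata word (bits 48-55), which is why ENCSIZE_SHIFT moves from 48 down to 40. Because encodeRank only ORs bits in, it assumes the rank byte of the input word is still zero. A standalone round-trip sketch of the same arithmetic (illustration only, not part of the patch):

    public class RankPackingDemo {
        static final long RANK_MASK = 0xFFL;
        static final int RANK_SHIFT = 48;

        // Same arithmetic as EdgePageDocumentsMetadata.encodeRank: OR the
        // clamped rank into bits 48..55 of an already-encoded word.
        static long encodeRank(long encoded, int rank) {
            return encoded | Math.min(RANK_MASK, Math.max(0, rank)) << RANK_SHIFT;
        }

        // Same as decodeRank: shift right, mask off one byte.
        static int decodeRank(long encoded) {
            return (int) ((encoded >>> RANK_SHIFT) & RANK_MASK);
        }

        public static void main(String[] args) {
            long meta = 0x0000_00C0_0000_0000L;                     // pre-existing metadata bits, rank byte zero
            System.out.println(decodeRank(encodeRank(meta, 83)));   // 83
            System.out.println(decodeRank(encodeRank(meta, 999)));  // 255: clamped to RANK_MASK
        }
    }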
nu/marginalia/wmsa/edge/index/postings/DomainRankings.java (new)
@@ -0,0 +1,43 @@
+package nu.marginalia.wmsa.edge.index.postings;
+
+import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
+import it.unimi.dsi.fastutil.ints.Int2ShortOpenHashMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static java.lang.Math.max;
+import static java.lang.Math.min;
+
+public class DomainRankings {
+    private final Int2ShortOpenHashMap rankings;
+
+    private final int MAX_MEANINGFUL_RANK = 50_000;
+    private final int MAX_RANK_VALUE = 255;
+    private final int MIN_RANK_VALUE = 1;
+    private final double RANK_SCALING_FACTOR = (double) MAX_RANK_VALUE / MAX_MEANINGFUL_RANK;
+
+    public DomainRankings() {
+        rankings = new Int2ShortOpenHashMap();
+    }
+    public DomainRankings(Int2IntOpenHashMap values) {
+        rankings = new Int2ShortOpenHashMap(values.size());
+        values.forEach(this::putRanking);
+    }
+
+    private void putRanking(int domainId, int value) {
+        rankings.put(domainId, scaleRank(value));
+    }
+
+    private short scaleRank(int value) {
+        double rankScaled = RANK_SCALING_FACTOR * value;
+        return (short) min(MAX_RANK_VALUE, max(MIN_RANK_VALUE, rankScaled));
+    }
+
+    public int getRanking(int domainId) {
+        return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE);
+    }
+
+    public int size() {
+        return rankings.size();
+    }
+}
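DomainRankings compresses a PageRank ordering into one byte per domain: list positions are scaled by 255/50,000 and clamped into 1..255, and domains absent from the map report 255, the worst rank. That default is why the updated ForwardIndexConverterTest below expects 0x00FF000000000000L in the metadata word. A worked sketch of scaleRank with the constants copied from the class (the harness itself is illustrative only):

    public class RankScaleDemo {
        public static void main(String[] args) {
            double factor = 255.0 / 50_000;             // RANK_SCALING_FACTOR
            System.out.println(scale(factor, 10_000));  // 51  -- mid-list domain
            System.out.println(scale(factor, 50));      // 1   -- clamped to the floor
            System.out.println(scale(factor, 80_000));  // 255 -- past MAX_MEANINGFUL_RANK
        }

        static short scale(double factor, int position) {
            return (short) Math.min(255, Math.max(1, factor * position));
        }
    }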
SearchIndexControl.java
@@ -6,6 +6,7 @@ import nu.marginalia.wmsa.configuration.server.Initialization;
 import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
 import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
 import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
 import nu.marginalia.wmsa.edge.index.svc.EdgeOpsLockService;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -26,13 +27,14 @@ public class SearchIndexControl {
 
     @Inject
     public SearchIndexControl(IndexServicesFactory servicesFactory,
-                              EdgeOpsLockService opsLockService) {
+                              EdgeOpsLockService opsLockService,
+                              EdgeIndexSearchSetsService searchSetsService) {
         this.servicesFactory = servicesFactory;
 
         this.primaryIndexWriter = servicesFactory.getIndexWriter(0);
         this.secondaryIndexWriter = servicesFactory.getIndexWriter(1);
 
-        index = servicesFactory.createIndexBucket();
+        index = servicesFactory.createIndexBucket(searchSetsService);
         this.opsLockService = opsLockService;
     }
 
nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java
@@ -3,6 +3,8 @@ package nu.marginalia.wmsa.edge.index.postings.forward;
 import com.upserve.uppend.blobs.NativeIO;
 import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
 import nu.marginalia.util.array.LongArray;
+import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
+import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
 import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader;
 import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile;
 import org.roaringbitmap.IntConsumer;
@@ -18,26 +20,26 @@ import java.nio.file.Path;
 import static nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexParameters.*;
 
 public class ForwardIndexConverter {
-    private static final int RWF_BIN_SIZE = 10_000_000;
-
-    private final Path tmpFileDir;
     private final File inputFile;
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     private final Path outputFileDocsId;
     private final Path outputFileDocsData;
+    private final DomainRankings domainRankings;
 
-    public ForwardIndexConverter(Path tmpFileDir,
+    public ForwardIndexConverter(
                                  File inputFile,
                                  Path outputFileDocsId,
-                                 Path outputFileDocsData
+                                 Path outputFileDocsData,
+                                 DomainRankings domainRankings
                                  ) {
-        this.tmpFileDir = tmpFileDir;
         this.inputFile = inputFile;
         this.outputFileDocsId = outputFileDocsId;
         this.outputFileDocsData = outputFileDocsData;
+        this.domainRankings = domainRankings;
     }
 
     public void convert() throws IOException {
@@ -50,6 +52,8 @@ public class ForwardIndexConverter {
 
         logger.info("Converting {} {}", inputFile, journalReader.fileHeader);
 
+        logger.info("Domain Rankings size = {}", domainRankings.size());
+
         try {
             LongArray docsFileId = getDocIds(outputFileDocsId, journalReader);
 
@@ -68,7 +72,10 @@ public class ForwardIndexConverter {
             journalReader.forEach(entry -> {
                 long entryOffset = (long) ENTRY_SIZE * docIdToIdx.get(entry.urlId());
 
-                docFileData.set(entryOffset + METADATA_OFFSET, entry.docMeta());
+                int ranking = domainRankings.getRanking(entry.domainId());
+                long meta = EdgePageDocumentsMetadata.encodeRank(entry.docMeta(), ranking);
+
+                docFileData.set(entryOffset + METADATA_OFFSET, meta);
                 docFileData.set(entryOffset + DOMAIN_OFFSET, entry.domainId());
             });
 
ParamMatchingQueryFilter.java
@@ -33,6 +33,11 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
         if (!validateSize(post)) {
             return false;
         }
+
+        if (!validateRank(post)) {
+            return false;
+        }
+
         return true;
     }
 
@@ -51,6 +56,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
 
         return limit.test(quality);
     }
+
     private boolean validateYear(ForwardIndexReader.DocPost post) {
         if (params.year().type() == SpecificationLimitType.NONE)
             return true;
@@ -69,6 +75,15 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
         return params.size().test(postVal);
     }
 
+    private boolean validateRank(ForwardIndexReader.DocPost post) {
+        if (params.rank().type() == SpecificationLimitType.NONE)
+            return true;
+
+        int postVal = EdgePageDocumentsMetadata.decodeRank(post.meta());
+
+        return params.rank().test(postVal);
+    }
+
     @Override
     public double cost() {
         return 32;
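For a query limit like rank<50, validateRank recovers the rank byte from the document metadata word and hands it to the limit's test. A hedged sketch of the comparison that ends up being evaluated (SpecificationLimit's exact API is inferred from its usage in this diff):

    long meta = 0x0033_0000_0000_0000L;           // rank byte = 0x33 = 51
    int postVal = (int) ((meta >>> 48) & 0xFFL);  // what decodeRank returns
    System.out.println(postVal < 50);             // false: this document is filtered out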
nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java
@@ -53,6 +53,11 @@ public class ReverseIndexReader {
     }
 
     public EntrySource documents(int wordId, ReverseIndexEntrySourceBehavior behavior) {
+        if (null == words) {
+            logger.warn("Reverse index is not ready, dropping query");
+            return new EmptyEntrySource();
+        }
+
         if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource();
 
         long offset = words.get(wordId);
IndexQueryParams.java
@@ -7,6 +7,7 @@ import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit;
 public record IndexQueryParams(SpecificationLimit qualityLimit,
                                SpecificationLimit year,
                                SpecificationLimit size,
+                               SpecificationLimit rank,
                                SearchSet searchSet,
                                QueryStrategy queryStrategy
                                )
RankingAlgorithm.java (moved from nu.marginalia.util.ranking to nu.marginalia.wmsa.edge.index.ranking)
@@ -1,21 +1,19 @@
-package nu.marginalia.util.ranking;
+package nu.marginalia.wmsa.edge.index.ranking;
 
 import gnu.trove.list.TIntList;
 import gnu.trove.list.array.TIntArrayList;
 import gnu.trove.map.hash.TIntIntHashMap;
 import gnu.trove.map.hash.TIntObjectHashMap;
 import it.unimi.dsi.fastutil.ints.IntArrays;
 import it.unimi.dsi.fastutil.ints.IntComparator;
-import org.roaringbitmap.RoaringBitmap;
+import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Set;
-import java.util.function.IntToDoubleFunction;
-import java.util.stream.IntStream;
+import java.util.function.Supplier;
 
 import static java.lang.Math.min;
 
 public abstract class RankingAlgorithm {
     protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
@@ -133,29 +131,7 @@ public abstract class RankingAlgorithm {
         return domainsById.size();
     }
 
-    public RankVector pageRankVector() {
-        RankVector rank = new RankVector(1.d / domainsById.size());
-
-        int iter_max = 100;
-        for (int i = 0; i < iter_max; i++) {
-            RankVector newRank = createNewRankVector(rank);
-
-            double oldNorm = rank.norm();
-            double newNorm = newRank.norm();
-            double dNorm = oldNorm - newNorm;
-            if (i < iter_max-1) {
-                adjustRankVector(newRank, dNorm, oldNorm);
-            }
-
-            rank = newRank;
-        }
-
-        return rank;
-    }
-
-    public RoaringBitmap pageRank(int resultCount) {
+    public <T> T pageRank(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
         RankVector rank = new RankVector(1.d / domainsById.size());
 
         int iter_max = 100;
@@ -174,10 +150,10 @@ public abstract class RankingAlgorithm {
         }
 
-        return rank.getRanking(resultCount);
+        return rank.getRanking(resultCount, accumulatorP).get();
     }
 
-    public RoaringBitmap pageRankWithPeripheralNodes(int resultCount) {
+    public <T> T pageRankWithPeripheralNodes(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
         RankVector rank = new RankVector(1.d / domainsById.size());
 
         int iter_max = 100;
@@ -201,32 +177,11 @@ public abstract class RankingAlgorithm {
 
         logger.info("PRWPN iteration done");
 
-        return rank.getRanking(resultCount);
+        return rank.getRanking(resultCount, accumulatorP).get();
     }
 
     abstract void adjustRankVector(RankVector vector, double dNorm, double oldNorm);
 
-    public TIntList pageRank(IntToDoubleFunction weight, int resultCount) {
-        RankVector rank = new RankVector(1.d / domainsById.size());
-
-        int iter_max = 100;
-        for (int i = 0; i < iter_max; i++) {
-            RankVector newRank = createNewRankVector(rank);
-
-            double oldNorm = rank.norm();
-            double newNorm = newRank.norm();
-            double dNorm = oldNorm - newNorm;
-
-            if (i < iter_max-1) {
-                adjustRankVector(newRank, dNorm, oldNorm);
-            }
-
-            rank = newRank;
-        }
-
-        return rank.getRanking(weight, resultCount);
-    }
-
     abstract RankVector createNewRankVector(RankVector rank);
 
     public boolean includeInRanking(RankingDomainData data) {
@@ -271,9 +226,8 @@ public abstract class RankingAlgorithm {
 
         public double norm() {
             double v = 0.;
-            for (int i = 0; i < rank.length; i++) {
-                if (rank[i] > 0) { v+=rank[i]; }
-                else { v -= rank[i]; }
+            for (double value : rank) {
+                v += Math.abs(value);
             }
             return v;
         }
@@ -281,73 +235,38 @@ public abstract class RankingAlgorithm {
         public double norm(RankVector other) {
             double v = 0.;
             for (int i = 0; i < rank.length; i++) {
-                double dv = rank[i] - other.get(i);
-
-                if (dv > 0) { v+=dv; }
-                else { v -= dv; }
+                v += Math.abs(rank[i] - other.get(i));
             }
             return v;
         }
 
-        public TIntList getRanking(IntToDoubleFunction other, int numResults) {
-            TIntArrayList list = new TIntArrayList(numResults);
-
-            Comparator<Integer> comparator = Comparator.comparing(i -> Math.sqrt(other.applyAsDouble(domainIdToIndex.get(i)) * rank[i]));
-
-            IntStream.range(0, rank.length)
-                    .boxed()
-                    .sorted(comparator.reversed())
-                    .map(domainIndexToId::get)
-                    .limit(numResults)
-                    .forEach(list::add);
-
-            return list;
-        }
-
-        public RoaringBitmap getRanking(int numResults) {
-            if (numResults < 0) {
-                numResults = domainIdToIndex.size();
-            }
-            if (numResults >= rank.length) {
-                numResults = rank.length;
-            }
+        public <T> RankingResultAccumulator<T> getRanking(int numResults, Supplier<RankingResultAccumulator<T>> accumulatorP) {
 
-            RoaringBitmap list = new RoaringBitmap();
+            numResults = min(numResults, min(domainIdToIndex.size(), rank.length));
 
-            int[] nodes = new int[rank.length];
-            Arrays.setAll(nodes, i->i);
-            IntComparator comp = (i,j) -> (int) Math.signum(rank[j] - rank[i]);
-            IntArrays.quickSort(nodes, comp);
-
-            int i;
-
-            for (i = 0; i < numResults; i++) {
+            int[] nodes = sortOrder(rank);
+            var accumulator = accumulatorP.get();
+
+            for (int i = 0; i < numResults; i++) {
                 int id = domainIndexToId.get(nodes[i]);
 
                 if (includeInRanking(domainsById.get(id)))
-                    list.add(id);
+                    accumulator.add(id, i);
             }
 
-            for (; i < nodes.length && domainsById.size() < numResults; i++) {
-                int id = domainIndexToId.get(nodes[i]);
-
-                if (includeInRanking(domainsById.get(id)))
-                    list.add(id);
-            }
-
-            return list;
+            return accumulator;
         }
 
+        private static int[] sortOrder(double[] values) {
+            int[] ret = new int[values.length];
+            Arrays.setAll(ret, i->i);
+            IntArrays.quickSort(ret, (i,j) -> (int) Math.signum(values[j] - values[i]));
+            return ret;
+        }
+
         public void incrementAll(double v) {
             for (int i = 0; i < rank.length; i++) {
                 rank[i]+=v;
             }
         }
 
         int size() {
             return domainsById.size();
         }
     }
RankingDomainData.java (moved)
@@ -1,4 +1,4 @@
-package nu.marginalia.util.ranking;
+package nu.marginalia.wmsa.edge.index.ranking;
 
 import lombok.AllArgsConstructor;
 import lombok.Data;
RankingDomainFetcher.java (moved)
@@ -1,4 +1,4 @@
-package nu.marginalia.util.ranking;
+package nu.marginalia.wmsa.edge.index.ranking;
 
 import com.google.inject.Inject;
 import com.zaxxer.hikari.HikariDataSource;
ReversePageRank.java (renamed from BetterReversePageRank.java, moved)
@@ -1,10 +1,10 @@
-package nu.marginalia.util.ranking;
+package nu.marginalia.wmsa.edge.index.ranking;
 
 
-public class BetterReversePageRank extends RankingAlgorithm {
+public class ReversePageRank extends RankingAlgorithm {
 
 
-    public BetterReversePageRank(RankingDomainFetcher domains, String... origins) {
+    public ReversePageRank(RankingDomainFetcher domains, String... origins) {
         super(domains, origins);
     }
 
@@ -20,8 +20,6 @@ public class BetterReversePageRank extends RankingAlgorithm {
         double newRankValue = 0;
 
         if (links != null && links.size() > 0) {
-
-
             for (int j = 0; j < links.size(); j++) {
                 var revLinks = linkDataDest2Src[links.getQuick(j)];
                 newRankValue += rank.get(links.getQuick(j)) / revLinks.size();
StandardPageRank.java (renamed from BetterStandardPageRank.java, moved)
@@ -1,9 +1,9 @@
-package nu.marginalia.util.ranking;
+package nu.marginalia.wmsa.edge.index.ranking;
 
 
-public class BetterStandardPageRank extends RankingAlgorithm {
+public class StandardPageRank extends RankingAlgorithm {
 
-    public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) {
+    public StandardPageRank(RankingDomainFetcher domains, String... origins) {
         super(domains, origins);
     }
 
@@ -38,8 +38,7 @@ public class BetterStandardPageRank extends RankingAlgorithm {
 
     @Override
     void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
-        originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() /* dNorm/originDomainIds.size() */ ));
-        // vector.incrementAll(0.14*dNorm/vector.size());
+        originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() ));
     }
 
 }
nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultAccumulator.java (new)
@@ -0,0 +1,6 @@
+package nu.marginalia.wmsa.edge.index.ranking.accumulator;
+
+public interface RankingResultAccumulator<T> {
+    void add(int domainId, int rank);
+    T get();
+}
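This interface is what lets the ranking loop stay generic: the caller passes a Supplier so each run builds a fresh result in whatever shape it needs. A call-site sketch using the three accumulators added below (assumes this commit's classes are on the classpath and a RankingDomainFetcher is available; not a standalone program):

    import gnu.trove.list.array.TIntArrayList;
    import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
    import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
    import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
    import nu.marginalia.wmsa.edge.index.ranking.accumulator.*;
    import org.roaringbitmap.RoaringBitmap;

    class AccumulatorUsageSketch {
        static void demo(RankingDomainFetcher domains) {
            var spr = new StandardPageRank(domains, "memex.marginalia.nu");

            // One ranking loop, three result shapes -- picked by the supplier:
            RoaringBitmap asBitmap =
                    spr.pageRankWithPeripheralNodes(1000, RankingResultBitSetAccumulator::new);
            Int2IntOpenHashMap asRankMap =
                    spr.pageRankWithPeripheralNodes(1000, () -> new RankingResultHashMapAccumulator(100_000));
            TIntArrayList asList =
                    spr.pageRankWithPeripheralNodes(1000, RankingResultListAccumulator::new);
        }
    }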
nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultBitSetAccumulator.java (new)
@@ -0,0 +1,17 @@
+package nu.marginalia.wmsa.edge.index.ranking.accumulator;
+
+import org.roaringbitmap.RoaringBitmap;
+
+public class RankingResultBitSetAccumulator implements RankingResultAccumulator<RoaringBitmap> {
+    private final RoaringBitmap result = new RoaringBitmap();
+
+    @Override
+    public void add(int domainId, int rank) {
+        result.add(domainId);
+    }
+
+    @Override
+    public RoaringBitmap get() {
+        return result;
+    }
+}
nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultHashMapAccumulator.java (new)
@@ -0,0 +1,21 @@
+package nu.marginalia.wmsa.edge.index.ranking.accumulator;
+
+import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
+
+public class RankingResultHashMapAccumulator implements RankingResultAccumulator<Int2IntOpenHashMap> {
+    private final Int2IntOpenHashMap result;
+
+    public RankingResultHashMapAccumulator(int size) {
+        result = new Int2IntOpenHashMap(size);
+    }
+
+    @Override
+    public void add(int domainId, int rank) {
+        result.put(domainId, rank);
+    }
+
+    @Override
+    public Int2IntOpenHashMap get() {
+        return result;
+    }
+}
nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultListAccumulator.java (new)
@@ -0,0 +1,24 @@
+package nu.marginalia.wmsa.edge.index.ranking.accumulator;
+
+import gnu.trove.list.array.TIntArrayList;
+
+public class RankingResultListAccumulator implements RankingResultAccumulator<TIntArrayList> {
+    private final TIntArrayList result;
+
+    public RankingResultListAccumulator(int size) {
+        result = new TIntArrayList(size);
+    }
+    public RankingResultListAccumulator() {
+        result = new TIntArrayList(10_000);
+    }
+
+    @Override
+    public void add(int domainId, int rank) {
+        result.add(domainId);
+    }
+
+    @Override
+    public TIntArrayList get() {
+        return result;
+    }
+}
(file in nu.marginalia.util.ranking.old, moved)
@@ -1,4 +1,4 @@
-package nu.marginalia.util.ranking.old;
+package nu.marginalia.wmsa.edge.index.ranking.old;
 
 
 import com.zaxxer.hikari.HikariDataSource;
StandardPageRank.java (in nu.marginalia.util.ranking.old, moved)
@@ -1,4 +1,4 @@
-package nu.marginalia.util.ranking.old;
+package nu.marginalia.wmsa.edge.index.ranking.old;
 
 
 import com.zaxxer.hikari.HikariDataSource;
@@ -125,7 +125,6 @@ public class StandardPageRank {
 
         final TIntArrayList empty = new TIntArrayList();
 
-        double rankNorm = rank.norm();
         RankVector newRank = new RankVector(0);
 
         for (DomainData domain : domains.valueCollection()) {
@@ -176,8 +175,6 @@ public class StandardPageRank {
             }
         });
     }
 
-    TIntHashSet deadEnds = new TIntHashSet(domains.size());
-
     }
 
     private class RankVector {
PerusePageRankV2.java (moved to nu.marginalia.wmsa.edge.index.ranking.tool)
@@ -1,4 +1,4 @@
-package nu.marginalia.util.ranking.tool;
+package nu.marginalia.wmsa.edge.index.ranking.tool;
 
 
 import com.zaxxer.hikari.HikariDataSource;
@@ -10,9 +10,9 @@ import it.unimi.dsi.fastutil.ints.IntArrays;
 import it.unimi.dsi.fastutil.ints.IntComparator;
 import lombok.AllArgsConstructor;
 import lombok.SneakyThrows;
-import nu.marginalia.util.ranking.RankingAlgorithm;
-import nu.marginalia.util.ranking.RankingDomainData;
-import nu.marginalia.util.ranking.RankingDomainFetcher;
+import nu.marginalia.wmsa.edge.index.ranking.RankingAlgorithm;
+import nu.marginalia.wmsa.edge.index.ranking.RankingDomainData;
+import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
 import org.jetbrains.annotations.NotNull;
@@ -33,8 +33,6 @@ public class PerusePageRankV2 {
     TIntArrayList[] linkDataSrc2Dest;
     TIntArrayList[] linkDataDest2Src;
 
-    private static final boolean getNames = true;
-
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     static final LinkedBlockingQueue<LinkAdjacencies> uploadQueue = new LinkedBlockingQueue<>(10);
UpdateDomainRanksTool2.java (moved to nu.marginalia.wmsa.edge.index.ranking.tool)
@@ -1,9 +1,10 @@
-package nu.marginalia.util.ranking.tool;
+package nu.marginalia.wmsa.edge.index.ranking.tool;
 
 import com.zaxxer.hikari.HikariDataSource;
 import lombok.SneakyThrows;
-import nu.marginalia.util.ranking.BetterReversePageRank;
-import nu.marginalia.util.ranking.RankingDomainFetcher;
+import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
+import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
+import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
 import org.mariadb.jdbc.Driver;
@@ -17,8 +18,6 @@ public class UpdateDomainRanksTool2 {
 
     private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class);
 
-    public final long domainIdMax = -1;
-    public int domainCount;
     private volatile static int rankMax;
 
     static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
@@ -35,20 +34,21 @@ public class UpdateDomainRanksTool2 {
         logger.info("Ranking");
         var ds = new DatabaseModule().provideConnection();
         var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
-        var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
+        var rpr = new ReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
 
-        var rankVector = rpr.pageRankVector();
         rankMax = rpr.size();
         uploader.start();
 
-        var rankData = rpr.pageRankWithPeripheralNodes(rankMax);
-        for (int i : rankData) {
+        var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new);
+
+        rankData.forEach(i -> {
             try {
                 uploadQueue.put(i);
             } catch (InterruptedException e) {
                 e.printStackTrace();
             }
-        }
+            return true;
+        });
 
         long end = System.currentTimeMillis();
         running = false;
EdgeIndexQueryService.java
@@ -129,6 +129,7 @@ public class EdgeIndexQueryService {
                 specsSet.quality,
                 specsSet.year,
                 specsSet.size,
+                specsSet.rank,
                 getSearchSet(specsSet),
                 specsSet.queryStrategy);
     }
nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java
@@ -2,51 +2,43 @@ package nu.marginalia.wmsa.edge.index.svc;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import com.zaxxer.hikari.HikariDataSource;
-import gnu.trove.list.TIntList;
-import gnu.trove.list.array.TIntArrayList;
 import lombok.SneakyThrows;
-import nu.marginalia.util.ranking.BetterReversePageRank;
-import nu.marginalia.util.ranking.BetterStandardPageRank;
-import nu.marginalia.util.ranking.RankingDomainFetcher;
+import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
+import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
+import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
+import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator;
+import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator;
 import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
 import nu.marginalia.wmsa.edge.index.model.RankingSettings;
+import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
 import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet;
 import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet;
 import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny;
 import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
-import org.roaringbitmap.RoaringBitmap;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 
 @Singleton
 public class EdgeIndexSearchSetsService {
-    private final HikariDataSource dataSource;
-    private RankingDomainFetcher rankingDomains;
+    private final RankingDomainFetcher rankingDomains;
     private final RankingSettings rankingSettings;
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     private final SearchSet anySet = new SearchSetAny();
     private volatile RankingSearchSet retroSet;
     private volatile RankingSearchSet smallWebSet;
     private volatile RankingSearchSet academiaSet;
 
+    private volatile DomainRankings domainRankings = new DomainRankings();
+
     @Inject
-    public EdgeIndexSearchSetsService(HikariDataSource dataSource,
-                                      RankingDomainFetcher rankingDomains,
+    public EdgeIndexSearchSetsService(RankingDomainFetcher rankingDomains,
                                       RankingSettings rankingSettings,
                                      IndexServicesFactory servicesFactory) throws IOException {
-        this.dataSource = dataSource;
         this.rankingDomains = rankingDomains;
         this.rankingSettings = rankingSettings;
 
         smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
         academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat"));
         retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat"));
 
         logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
     }
 
     public void recalculateAll() {
@@ -55,52 +47,27 @@ public class EdgeIndexSearchSetsService {
         updateSmallWebDomains();
     }
 
-    @SneakyThrows
-    public RoaringBitmap goodUrls() {
-        RoaringBitmap domains = new RoaringBitmap();
-        RoaringBitmap urls = new RoaringBitmap();
-
-        try (var connection = dataSource.getConnection()) {
-            try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) {
-                stmt.setFetchSize(10_000);
-                var rsp = stmt.executeQuery();
-                while (rsp.next()) {
-                    domains.add(rsp.getInt(1));
-                }
-            }
-
-            // For some reason, doing this "INNER JOIN" in Java is significantly faster than doing it in SQL
-            try (var stmt = connection.prepareStatement("SELECT ID,DOMAIN_ID FROM EC_URL WHERE VISITED AND EC_URL.STATE='OK'")) {
-                stmt.setFetchSize(10_000);
-                var rsp = stmt.executeQuery();
-                while (rsp.next()) {
-                    if (domains.contains(rsp.getInt(2))) {
-                        urls.add(rsp.getInt(1));
-                    }
-                }
-            }
-
-        }
-
-        return urls;
-    }
-
     @SneakyThrows
     public void updateRetroDomains() {
-        var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
-        var data = spr.pageRankWithPeripheralNodes(spr.size() / 2);
+        var spr = new StandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
+        var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new);
 
         synchronized (this) {
             retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
             retroSet.write();
         }
+
+        var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2, () -> new RankingResultHashMapAccumulator(100_000));
+        synchronized (this) {
+            domainRankings = new DomainRankings(ranks);
+        }
     }
 
     @SneakyThrows
     public void updateSmallWebDomains() {
-        var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
+        var rpr = new ReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
         rpr.setMaxKnownUrls(750);
-        var data = rpr.pageRankWithPeripheralNodes(rpr.size());
+        var data = rpr.pageRankWithPeripheralNodes(rpr.size(), RankingResultBitSetAccumulator::new);
 
         synchronized (this) {
             smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
@@ -110,8 +77,8 @@ public class EdgeIndexSearchSetsService {
 
     @SneakyThrows
     public void updateAcademiaDomains() {
-        var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
-        var data = spr.pageRankWithPeripheralNodes(spr.size()/2);
+        var spr = new StandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
+        var data = spr.pageRankWithPeripheralNodes(spr.size()/2, RankingResultBitSetAccumulator::new);
 
         synchronized (this) {
             academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
@@ -119,41 +86,8 @@ public class EdgeIndexSearchSetsService {
         }
     }
 
-    @SneakyThrows
-    public TIntList getStandardDomains() {
-        TIntArrayList results = new TIntArrayList();
-
-        try (var connection = dataSource.getConnection();
-             var stmt = connection.prepareStatement(
-                     """
-                     SELECT ID FROM EC_DOMAIN
-                     WHERE INDEXED>0
-                     AND STATE='ACTIVE'
-                     AND DOMAIN_ALIAS IS NULL
-                     ORDER BY ID ASC
-                     """);
-        ) {
-            var rs = stmt.executeQuery();
-            while (rs.next()) {
-                results.add(rs.getInt(1));
-            }
-        }
-        return results;
-    }
-
-    @SneakyThrows
-    public TIntList getSpecialDomains() {
-        TIntArrayList results = new TIntArrayList();
-        try (var connection = dataSource.getConnection();
-             var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'")
-        ) {
-            var rs = stmt.executeQuery();
-            while (rs.next()) {
-                results.add(rs.getInt(1));
-            }
-        }
-        return results;
+    public DomainRankings getDomainRankings() {
+        return domainRankings;
     }
 
     public SearchSet getSearchSetByName(SearchSetIdentifier searchSetIdentifier) {
EdgeSearchResultKeywordScore.java
@@ -24,6 +24,11 @@ public record EdgeSearchResultKeywordScore(int set,
             sum += 20;
         }
 
+        int rank = EdgePageDocumentsMetadata.decodeRank(encodedDocMetadata) - 13;
+        if (rank < 0)
+            sum += rank / 2;
+        else
+            sum += rank / 4;
+
         return sum;
     }
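Worked numbers for the rank adjustment above, assuming (as the surrounding additions to the sum suggest) that a smaller total means a better match:

    int bonus   = (5  - 13) / 2;   // -4: a well-ranked domain (rank 5) improves the score
    int penalty = (83 - 13) / 4;   // 17: a poorly-ranked domain (rank 83) worsens it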
EdgeSearchSpecification.java
@@ -19,6 +19,7 @@ public class EdgeSearchSpecification {
     public final SpecificationLimit quality;
     public final SpecificationLimit year;
     public final SpecificationLimit size;
+    public final SpecificationLimit rank;
 
     public final QueryLimits queryLimits;
     public final QueryStrategy queryStrategy;
QueryFactory.java
@@ -97,6 +97,7 @@ public class QueryFactory {
         SpecificationLimit qualityLimit = profile.getQualityLimit();
         SpecificationLimit year = profile.getYearLimit();
         SpecificationLimit size = profile.getSizeLimit();
+        SpecificationLimit rank = SpecificationLimit.none();
 
         for (Token t : basicQuery) {
             if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) {
@@ -116,6 +117,9 @@ public class QueryFactory {
             if (t.type == TokenType.SIZE_TERM) {
                 size = parseSpecificationLimit(t.str);
             }
+            if (t.type == TokenType.RANK_TERM) {
+                rank = parseSpecificationLimit(t.str);
+            }
             if (t.type == TokenType.QS_TERM) {
                 queryStrategy = parseQueryStrategy(t.str);
             }
@@ -154,6 +158,8 @@ public class QueryFactory {
             case QUALITY_TERM:
             case YEAR_TERM:
             case SIZE_TERM:
+            case RANK_TERM:
             case QS_TERM:
                 break; //
             case NEAR_TERM:
                 near = t.str;
@@ -199,6 +205,7 @@ public class QueryFactory {
                 .quality(qualityLimit)
                 .year(year)
                 .size(size)
+                .rank(rank)
                 .domains(domains)
                 .queryStrategy(queryStrategy)
                 .searchSetIdentifier(profile.searchSetIdentifier);
QueryParser.java
@@ -93,6 +93,8 @@ public class QueryParser {
             entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr));
         } else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) {
             entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr));
+        } else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) {
+            entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr));
         } else if (t.str.startsWith("qs=")) {
             entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr));
         } else if (t.str.contains(":")) {
@@ -508,6 +510,7 @@ enum TokenType implements Predicate<Token> {
     QUALITY_TERM,
     YEAR_TERM,
     SIZE_TERM,
+    RANK_TERM,
     NEAR_TERM,
 
     QS_TERM,
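A quick check of the token pattern added above: "rank" followed by =, >, or < and digits, with the operator and number handed on via substring(4). Standalone sketch, not the project's parser:

    String t = "rank>50";
    boolean isRankTerm = t.startsWith("rank") && t.matches("rank[=><]\\d+");
    System.out.println(isRankTerm + " " + t.substring(4));  // true >50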
search syntax help page (HTML)
@@ -56,6 +56,9 @@
 <tr><td>year=2005</td><td>(beta) The document was ostensibly published in 2005</td></tr>
 <tr><td>year<2005</td><td>(beta) The document was ostensibly published in or before 2005</td></tr>
 
+<tr><td>rank>50</td><td>(beta) The ranking of the website is at least 50 in a span of 1 - 255</td></tr>
+<tr><td>rank<50</td><td>(beta) The ranking of the website is at most 50 in a span of 1 - 255</td></tr>
+
 <tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
 <tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>
 <tr><td>format:html123</td><td>Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites. </td></tr>
nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java
@@ -3,13 +3,15 @@ package nu.marginalia.wmsa.edge.index.model;
 
 import org.junit.jupiter.api.Test;
 
+import java.util.EnumSet;
+
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
 class EdgePageDocumentsMetadataTest {
 
     @Test
     public void codecYear() {
-        var meta = new EdgePageDocumentsMetadata(0, 0, 192, 0, 0, (byte) 0);
+        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 192, 0, 0, (byte) 0);
         long encoded = meta.encode();
         var decoded = new EdgePageDocumentsMetadata(encoded);
         assertEquals(192, decoded.year());
@@ -17,7 +19,7 @@ class EdgePageDocumentsMetadataTest {
 
     @Test
     public void codecTopology() {
-        var meta = new EdgePageDocumentsMetadata(0, 192, 0, 0, 0, (byte) 0);
+        var meta = new EdgePageDocumentsMetadata(0, 0, 192, 0, 0, 0, (byte) 0);
         long encoded = meta.encode();
         var decoded = new EdgePageDocumentsMetadata(encoded);
         assertEquals(192, decoded.topology());
@@ -25,7 +27,7 @@ class EdgePageDocumentsMetadataTest {
 
     @Test
     public void codecSets() {
-        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 14, 0, (byte) 0);
+        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 14, 0, (byte) 0);
         long encoded = meta.encode();
         var decoded = new EdgePageDocumentsMetadata(encoded);
         assertEquals(14, decoded.sets());
@@ -33,7 +35,7 @@ class EdgePageDocumentsMetadataTest {
 
     @Test
     public void codecQuality() {
-        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 9, (byte) 0);
+        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, 9, (byte) 0);
         long encoded = meta.encode();
         var decoded = new EdgePageDocumentsMetadata(encoded);
         assertEquals(9, decoded.quality());
@@ -41,7 +43,7 @@ class EdgePageDocumentsMetadataTest {
 
     @Test
     public void codecFlags() {
-        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, (byte) 255);
+        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, 0, (byte) 255);
         long encoded = meta.encode();
         System.out.println(Long.toHexString(encoded));
         var decoded = new EdgePageDocumentsMetadata(encoded);
@@ -57,7 +59,17 @@ class EdgePageDocumentsMetadataTest {
         assertEquals(50, new EdgePageDocumentsMetadata(0).withSize(4).size());
         assertEquals(50, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(4).encode()));
 
-        assertEquals(50*255, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).encode()));
-        assertEquals(50*255, new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).size());
+        assertEquals(50 * 255, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).encode()));
+        assertEquals(50 * 255, new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).size());
+    }
+
+    @Test
+    public void encRank() {
+        var meta = new EdgePageDocumentsMetadata(5, 22, 3, 8, EnumSet.noneOf(EdgePageDocumentFlags.class))
+                .withSize(0xffffffff).encode();
+        var enc2 = EdgePageDocumentsMetadata.encodeRank(meta, 83);
+
+        assertEquals(83, EdgePageDocumentsMetadata.decodeRank(enc2));
+        assertEquals(5, EdgePageDocumentsMetadata.decodeTopology(enc2));
     }
 }
ForwardIndexConverterTest.java
@@ -6,6 +6,7 @@ import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
 import nu.marginalia.util.test.TestUtil;
 import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
 import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
+import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
 import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry;
 import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader;
 import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile;
@@ -36,7 +37,6 @@ class ForwardIndexConverterTest {
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     Path dataDir;
-    private Path wordsFile;
     private Path docsFileId;
     private Path docsFileData;
 
@@ -71,7 +71,6 @@ class ForwardIndexConverterTest {
 
         var reader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile));
 
-        wordsFile = dataDir.resolve("words.dat");
         docsFileId = dataDir.resolve("docs-i.dat");
         docsFileData = dataDir.resolve("docs-d.dat");
     }
@@ -104,18 +103,15 @@ class ForwardIndexConverterTest {
     @Test
     void testForwardIndex() throws IOException {
 
-        Path tmpDir = Path.of("/tmp");
-
-        new ForwardIndexConverter(tmpDir, indexFile.toFile(), docsFileId, docsFileData).convert();
+        new ForwardIndexConverter(indexFile.toFile(), docsFileId, docsFileData, new DomainRankings()).convert();
 
         var forwardReader = new ForwardIndexReader(docsFileId, docsFileData);
 
         for (int i = 36; i < workSetSize; i++) {
-            assertEquals(i % 5, forwardReader.getDocMeta(i));
+            assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(i));
             assertEquals(i/20, forwardReader.getDomainId(i));
         }
 
         TestUtil.clearTempDir(dataDir);
     }
 
EdgeIndexIntegrationTest.java
@@ -82,6 +82,7 @@ public class EdgeIndexIntegrationTest {
                            .year(SpecificationLimit.none())
                            .quality(SpecificationLimit.none())
                            .size(SpecificationLimit.none())
+                           .rank(SpecificationLimit.none())
                            .domains(new ArrayList<>())
                            .searchSetIdentifier(SearchSetIdentifier.NONE)
                            .subqueries(List.of(new EdgeSearchSubquery(
@@ -113,6 +114,7 @@ public class EdgeIndexIntegrationTest {
                            .year(SpecificationLimit.none())
                            .quality(SpecificationLimit.none())
                            .size(SpecificationLimit.none())
+                           .rank(SpecificationLimit.none())
                            .queryStrategy(QueryStrategy.SENTENCE)
                            .domains(List.of(2))
                            .subqueries(List.of(new EdgeSearchSubquery(
@@ -139,6 +141,7 @@ public class EdgeIndexIntegrationTest {
                            .quality(SpecificationLimit.none())
                            .year(SpecificationLimit.equals(1998))
                            .size(SpecificationLimit.none())
+                           .rank(SpecificationLimit.none())
                            .queryStrategy(QueryStrategy.SENTENCE)
                            .searchSetIdentifier(SearchSetIdentifier.NONE)
                            .subqueries(List.of(new EdgeSearchSubquery(
@@ -161,7 +164,7 @@ public class EdgeIndexIntegrationTest {
 
         long fullId = id | ((long) (32 - (id % 32)) << 32);
 
-        var header = new SearchIndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, id % 5, id, id % 20, (byte) 0).encode());
+        var header = new SearchIndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
 
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
EdgeIndexIntegrationTestModule.java
@@ -4,6 +4,7 @@ import com.google.inject.AbstractModule;
 import com.google.inject.name.Names;
 import nu.marginalia.util.test.TestUtil;
 import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
+import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
 import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
 import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny;
 import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
@@ -42,11 +43,12 @@ public class EdgeIndexIntegrationTestModule extends AbstractModule {
         System.setProperty("small-ram", "true");
         try {
             bind(IndexServicesFactory.class).toInstance(new IndexServicesFactory(Path.of("/tmp"),
-                    slowDir, fastDir, null
+                    slowDir, fastDir
             ));
 
             EdgeIndexSearchSetsService setsServiceMock = Mockito.mock(EdgeIndexSearchSetsService.class);
             when(setsServiceMock.getSearchSetByName(SearchSetIdentifier.NONE)).thenReturn(new SearchSetAny());
+            when(setsServiceMock.getDomainRankings()).thenReturn(new DomainRankings());
 
             bind(EdgeIndexSearchSetsService.class).toInstance(setsServiceMock);
 