Reintroduce the ability to filter search results by their ranking.

This commit is contained in:
Viktor Lofgren 2023-02-03 13:31:51 +01:00
parent 4a07eda61c
commit 04f905f3a1
37 changed files with 305 additions and 536 deletions

View File

@ -1,39 +0,0 @@
package nu.marginalia.util.ranking;
public class BuggyReversePageRank extends RankingAlgorithm {
public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@Override
RankVector createNewRankVector(RankVector rank) {
double rankNorm = rank.norm();
RankVector newRank = new RankVector(0);
for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
var links = linkDataSrc2Dest[domainId];
if (links != null && links.size() > 0) {
double newRankValue = 0;
for (int j = 0; j < links.size(); j++) {
newRankValue += rank.get(links.getQuick(j)) / links.size();
}
newRank.set(domainId, 0.85*newRankValue/rankNorm);
}
}
return newRank;
}
@Override
void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
originDomainIds.forEach(id -> vector.increment(domainIdToIndex.get(id), dNorm/oldNorm));
}
}

View File

@ -1,45 +0,0 @@
package nu.marginalia.util.ranking;
public class BuggyStandardPageRank extends RankingAlgorithm {
public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@Override
RankingAlgorithm.RankVector createNewRankVector(RankingAlgorithm.RankVector rank) {
RankVector newRank = new RankVector(0);
for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
var links = linkDataSrc2Dest[domainId];
double newRankValue = 0;
if (links != null && links.size() > 0) {
for (int j = 0; j < links.size(); j++) {
int linkedDomain = links.getQuick(j);
int linkSize = 1;
var bl = linkDataSrc2Dest[linkedDomain];
if (bl != null) {
linkSize = bl.size();
}
newRankValue += rank.get(linkedDomain) / linkSize;
}
}
newRank.set(domainId, 0.85 * newRankValue);
}
return newRank;
}
@Override
void adjustRankVector(RankingAlgorithm.RankVector vector, double dNorm, double oldNorm) {
originDomainIds.forEach(id -> vector.increment(id, dNorm/originDomainIds.size()));
vector.incrementAll(0.14*dNorm/vector.size());
}
}

View File

@ -1,89 +0,0 @@
package nu.marginalia.util.ranking.tool;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.ToString;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class DedupTool {
private static final Logger logger = LoggerFactory.getLogger(DedupTool.class);
public Set<String> originDomains = new HashSet<>();
public Set<Integer> originDomainIds = new HashSet<>();
public final long domainIdMax = -1;
public int domainCount;
private volatile static int rankMax;
public int maxId() {
return (int) domainIdMax;
}
public int domainCount() {
return domainCount;
}
static LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
volatile static boolean running = true;
@AllArgsConstructor @ToString @Getter
static class Data {
String url;
int id;
String domain;
}
@SneakyThrows
public static void main(String... args) {
Driver driver = new Driver();
var ds = new DatabaseModule().provideConnection();
Map<Integer, Map<Integer, List<Data>>> domainToHashToUrl = new HashMap<>();
try (var conn = ds.getConnection();
var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?")
) {
fetchStmt.setFetchSize(10_000);
var rsp = fetchStmt.executeQuery();
while (rsp.next()) {
domainToHashToUrl.computeIfAbsent(rsp.getInt(1), i -> new HashMap<>())
.computeIfAbsent(rsp.getInt(2), i -> new ArrayList<>()).add(new Data(rsp.getString(3), rsp.getInt(4), rsp.getString(5)));
}
List<Integer> updateIds = new ArrayList<>();
domainToHashToUrl.forEach((domain, hashes) -> {
hashes.forEach((hash, urls) -> {
if (urls.size() > 1) {
Comparator<Data> c = Comparator.comparing(d -> d.domain.length());
var urls2 = urls.stream().sorted(c.thenComparing(d -> d.url.length()))
.collect(Collectors.partitioningBy(d -> d.url.endsWith("/")));
Stream
.concat(urls2.get(true).stream(),urls2.get(false).stream()).skip(1)
.map(Data::getId)
.forEach(updateIds::add);
}
});
});
for (int id : updateIds) {
updateStmt.setInt(1, id);
updateStmt.executeUpdate();
}
}
}
}

View File

@ -1,93 +0,0 @@
package nu.marginalia.util.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;
public class UpdateDomainRanksTool {
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class);
public Set<String> originDomains = new HashSet<>();
public Set<Integer> originDomainIds = new HashSet<>();
public final long domainIdMax = -1;
public int domainCount;
private volatile static int rankMax;
public int maxId() {
return (int) domainIdMax;
}
public int domainCount() {
return domainCount;
}
static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
volatile static boolean running = true;
@SneakyThrows
public static void main(String... args) {
org.mariadb.jdbc.Driver driver = new Driver();
var conn = new DatabaseModule().provideConnection();
long start = System.currentTimeMillis();
var uploader = new Thread(() -> uploadThread(conn), "Uploader");
logger.info("Ranking");
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
rankMax = spr.size()*2;
uploader.start();
var rankData = spr.pageRankWithPeripheralNodes(rankMax);
for (int i : rankData) {
try {
uploadQueue.put(i);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
long end = System.currentTimeMillis();
running = false;
uploader.join();
logger.info("Done in {}", (end - start)/1000.0);
}
public static void uploadThread(HikariDataSource dataSource) {
int i = 0;
try (var conn = dataSource.getConnection()) {
logger.info("Resetting rank");
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=1")) {
stmt.executeUpdate();
}
logger.info("Updating ranks");
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=? WHERE ID=?")) {
while (running || (!running && !uploadQueue.isEmpty())) {
var job = uploadQueue.take();
stmt.setDouble(1, i++ / (double) rankMax);
stmt.setInt(2, job);
stmt.executeUpdate();
}
}
} catch (SQLException | InterruptedException throwables) {
throwables.printStackTrace();
}
}
}

View File

@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.index;
import com.google.inject.Inject;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
import java.io.IOException;
@ -9,14 +10,16 @@ import java.io.IOException;
public class EdgeIndexControl {
private final IndexServicesFactory servicesFactory;
private final EdgeIndexSearchSetsService searchSetsService;
@Inject
public EdgeIndexControl(IndexServicesFactory servicesFactory) {
public EdgeIndexControl(IndexServicesFactory servicesFactory, EdgeIndexSearchSetsService searchSetsService) {
this.servicesFactory = servicesFactory;
this.searchSetsService = searchSetsService;
}
public void regenerateIndex() throws IOException {
servicesFactory.convertIndex();
servicesFactory.convertIndex(searchSetsService.getDomainRankings());
System.gc();
}

View File

@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
import nu.marginalia.wmsa.edge.index.postings.SearchIndex;
import nu.marginalia.wmsa.edge.index.postings.SearchIndexReader;
import nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexConverter;
@ -20,6 +21,7 @@ import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexConverter;
import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPrioReader;
import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPriorityParameters;
import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexReader;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -33,7 +35,6 @@ import java.util.concurrent.Callable;
@Singleton
public class IndexServicesFactory {
private final Path tmpFileDir;
private final EdgeDomainBlacklist domainBlacklist;
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -58,12 +59,10 @@ public class IndexServicesFactory {
public IndexServicesFactory(
@Named("tmp-file-dir") Path tmpFileDir,
@Named("partition-root-slow") Path partitionRootSlow,
@Named("partition-root-fast") Path partitionRootFast,
EdgeDomainBlacklist domainBlacklist
@Named("partition-root-fast") Path partitionRootFast
) throws IOException {
this.tmpFileDir = tmpFileDir;
this.domainBlacklist = domainBlacklist;
this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat");
this.keywordLexiconFile = new RootDataFile(partitionRootSlow, "dictionary.dat");
@ -106,8 +105,8 @@ public class IndexServicesFactory {
}
public void convertIndex() throws IOException {
convertForwardIndex();
public void convertIndex(DomainRankings domainRankings) throws IOException {
convertForwardIndex(domainRankings);
convertFullReverseIndex();
convertPriorityReverseIndex();
@ -148,13 +147,14 @@ public class IndexServicesFactory {
tryGc();
}
private void convertForwardIndex() throws IOException {
private void convertForwardIndex(DomainRankings domainRankings) throws IOException {
logger.info("Converting forward index data");
new ForwardIndexConverter(tmpFileDir,
new ForwardIndexConverter(
writerIndexFile.get(0),
fwdIndexDocId.get(NEXT_PART).toPath(),
fwdIndexDocData.get(NEXT_PART).toPath())
fwdIndexDocData.get(NEXT_PART).toPath(),
domainRankings)
.convert();
tryGc();
@ -212,8 +212,8 @@ public class IndexServicesFactory {
}
}
public SearchIndex createIndexBucket() {
return new SearchIndex(this, new EdgeIndexControl(this));
public SearchIndex createIndexBucket(EdgeIndexSearchSetsService searchSetsService) {
return new SearchIndex(this, new EdgeIndexControl(this, searchSetsService));
}
public SearchIndexReader getSearchIndexReader() throws IOException {

View File

@ -8,7 +8,8 @@ import java.util.Set;
import static java.lang.Math.max;
import static java.lang.Math.min;
public record EdgePageDocumentsMetadata(int encSize,
public record EdgePageDocumentsMetadata(int rank,
int encSize,
int topology,
int year,
int sets,
@ -16,9 +17,13 @@ public record EdgePageDocumentsMetadata(int encSize,
byte flags) {
public static final long RANK_MASK = 0xFFL;
public static final int RANK_SHIFT = 48;
public static final long ENCSIZE_MASK = 0xFFL;
public static final int ENCSIZE_SHIFT = 48;
public static final int ENCSIZE_SHIFT = 40;
public static final int ENCSIZE_MULTIPLIER = 50;
public static final long TOPOLOGY_MASK = 0xFFL;
public static final int TOPOLOGY_SHIFT = 32;
@ -39,7 +44,7 @@ public record EdgePageDocumentsMetadata(int encSize,
this(defaultValue());
}
public EdgePageDocumentsMetadata(int topology, int year, int sets, int quality, EnumSet<EdgePageDocumentFlags> flags) {
this(0, topology, year, sets, quality, encodeFlags(flags));
this(0, 0, topology, year, sets, quality, encodeFlags(flags));
}
public EdgePageDocumentsMetadata withSize(int size) {
@ -49,7 +54,7 @@ public record EdgePageDocumentsMetadata(int encSize,
final int encSize = (int) Math.min(ENCSIZE_MASK, Math.max(1, size / ENCSIZE_MULTIPLIER));
return new EdgePageDocumentsMetadata(encSize, topology, year, sets, quality, flags);
return new EdgePageDocumentsMetadata(rank, encSize, topology, year, sets, quality, flags);
}
private static byte encodeFlags(Set<EdgePageDocumentFlags> flags) {
@ -63,7 +68,8 @@ public record EdgePageDocumentsMetadata(int encSize,
}
public EdgePageDocumentsMetadata(long value) {
this( (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK),
this( (int) ((value >>> RANK_SHIFT) & RANK_MASK),
(int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK),
(int) ((value >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK),
(int) ((value >>> YEAR_SHIFT) & YEAR_MASK),
(int) ((value >>> SETS_SHIFT) & SETS_MASK),
@ -84,12 +90,13 @@ public record EdgePageDocumentsMetadata(int encSize,
ret |= min(YEAR_MASK, max(0, year)) << YEAR_SHIFT;
ret |= min(TOPOLOGY_MASK, max(0, topology)) << TOPOLOGY_SHIFT;
ret |= min(ENCSIZE_MASK, max(0, encSize)) << ENCSIZE_SHIFT;
ret |= min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
return ret;
}
public boolean isEmpty() {
return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0;
return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0 && rank == 0;
}
public static int decodeQuality(long encoded) {
@ -112,6 +119,12 @@ public record EdgePageDocumentsMetadata(int encSize,
return ENCSIZE_MULTIPLIER * (int) ((encoded >>> ENCSIZE_SHIFT) & ENCSIZE_MASK);
}
public static int decodeRank(long encoded) {
return (int) ((encoded >>> RANK_SHIFT) & RANK_MASK);
}
public static long encodeRank(long encoded, int rank) {
return encoded | min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
}
}
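An aside on the layout: the new rank field takes bits 48-55 of the encoded metadata long (RANK_SHIFT = 48, RANK_MASK = 0xFF), which is why ENCSIZE_SHIFT drops from 48 to 40. A minimal, self-contained sketch of the round trip, using only constants visible in this hunk (the demo class and sample values are illustrative, not part of the commit):

public class RankPackingDemo {
    static final long RANK_MASK = 0xFFL;
    static final int RANK_SHIFT = 48;

    // Mirrors encodeRank above: OR the clamped rank into bits 48-55.
    static long encodeRank(long encoded, int rank) {
        return encoded | Math.min(RANK_MASK, Math.max(0, rank)) << RANK_SHIFT;
    }

    // Mirrors decodeRank above: shift down and mask off eight bits.
    static int decodeRank(long encoded) {
        return (int) ((encoded >>> RANK_SHIFT) & RANK_MASK);
    }

    public static void main(String[] args) {
        long meta = 0x0000_1234_5678_9ABCL;       // pre-existing metadata bits below bit 48
        long withRank = encodeRank(meta, 83);
        System.out.println(decodeRank(withRank));              // 83
        System.out.println(withRank == (meta | (83L << 48)));  // true: lower bits untouched
    }
}

Note that encodeRank only ORs bits in, so it assumes the rank bits of its input are zero; the forward index converter below relies on the journal writer leaving them that way.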

View File

@ -0,0 +1,43 @@
package nu.marginalia.wmsa.edge.index.postings;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2ShortOpenHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static java.lang.Math.max;
import static java.lang.Math.min;
public class DomainRankings {
private final Int2ShortOpenHashMap rankings;
private final int MAX_MEANINGFUL_RANK = 50_000;
private final int MAX_RANK_VALUE = 255;
private final int MIN_RANK_VALUE = 1;
private final double RANK_SCALING_FACTOR = (double) MAX_RANK_VALUE / MAX_MEANINGFUL_RANK;
public DomainRankings() {
rankings = new Int2ShortOpenHashMap();
}
public DomainRankings(Int2IntOpenHashMap values) {
rankings = new Int2ShortOpenHashMap(values.size());
values.forEach(this::putRanking);
}
private void putRanking(int domainId, int value) {
rankings.put(domainId, scaleRank(value));
}
private short scaleRank(int value) {
double rankScaled = RANK_SCALING_FACTOR * value;
return (short) min(MAX_RANK_VALUE, max(MIN_RANK_VALUE, rankScaled));
}
public int getRanking(int domainId) {
return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE);
}
public int size() {
return rankings.size();
}
}
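In short: raw PageRank positions (0 to 50 000, lower is better) are squeezed into one byte, where 1 is the best value, anything past MAX_MEANINGFUL_RANK saturates at 255, and unknown domains also report 255. A small usage sketch with made-up domain ids:

import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;

public class DomainRankingsDemo {
    public static void main(String[] args) {
        Int2IntOpenHashMap raw = new Int2IntOpenHashMap();
        raw.put(1, 0);       // best position: clamps up to MIN_RANK_VALUE = 1
        raw.put(2, 10_000);  // 10_000 * 255 / 50_000 = 51
        raw.put(3, 90_000);  // past MAX_MEANINGFUL_RANK: saturates at 255

        DomainRankings rankings = new DomainRankings(raw);
        System.out.println(rankings.getRanking(1));   // 1
        System.out.println(rankings.getRanking(2));   // 51
        System.out.println(rankings.getRanking(3));   // 255
        System.out.println(rankings.getRanking(999)); // 255: unknown domains get the worst rank
    }
}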

View File

@ -6,6 +6,7 @@ import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
import nu.marginalia.wmsa.edge.index.svc.EdgeOpsLockService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -26,13 +27,14 @@ public class SearchIndexControl {
@Inject
public SearchIndexControl(IndexServicesFactory servicesFactory,
EdgeOpsLockService opsLockService) {
EdgeOpsLockService opsLockService,
EdgeIndexSearchSetsService searchSetsService) {
this.servicesFactory = servicesFactory;
this.primaryIndexWriter = servicesFactory.getIndexWriter(0);
this.secondaryIndexWriter = servicesFactory.getIndexWriter(1);
index = servicesFactory.createIndexBucket();
index = servicesFactory.createIndexBucket(searchSetsService);
this.opsLockService = opsLockService;
}

View File

@ -3,6 +3,8 @@ package nu.marginalia.wmsa.edge.index.postings.forward;
import com.upserve.uppend.blobs.NativeIO;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader;
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile;
import org.roaringbitmap.IntConsumer;
@ -18,26 +20,26 @@ import java.nio.file.Path;
import static nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexParameters.*;
public class ForwardIndexConverter {
private static final int RWF_BIN_SIZE = 10_000_000;
private final Path tmpFileDir;
private final File inputFile;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Path outputFileDocsId;
private final Path outputFileDocsData;
private final DomainRankings domainRankings;
public ForwardIndexConverter(Path tmpFileDir,
public ForwardIndexConverter(
File inputFile,
Path outputFileDocsId,
Path outputFileDocsData
Path outputFileDocsData,
DomainRankings domainRankings
) {
this.tmpFileDir = tmpFileDir;
this.inputFile = inputFile;
this.outputFileDocsId = outputFileDocsId;
this.outputFileDocsData = outputFileDocsData;
this.domainRankings = domainRankings;
}
public void convert() throws IOException {
@ -50,6 +52,8 @@ public class ForwardIndexConverter {
logger.info("Converting {} {}",inputFile, journalReader.fileHeader);
logger.info("Domain Rankings size = {}", domainRankings.size());
try {
LongArray docsFileId = getDocIds(outputFileDocsId, journalReader);
@ -68,7 +72,10 @@ public class ForwardIndexConverter {
journalReader.forEach(entry -> {
long entryOffset = (long) ENTRY_SIZE * docIdToIdx.get(entry.urlId());
docFileData.set(entryOffset + METADATA_OFFSET, entry.docMeta());
int ranking = domainRankings.getRanking(entry.domainId());
long meta = EdgePageDocumentsMetadata.encodeRank(entry.docMeta(), ranking);
docFileData.set(entryOffset + METADATA_OFFSET, meta);
docFileData.set(entryOffset + DOMAIN_OFFSET, entry.domainId());
});
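These new lines stamp each document's domain rank into its metadata word at conversion time, so query filters can read the rank straight out of the forward index without a separate rankings lookup. A hedged sketch of the same step in isolation (JournalEntry is a simplified stand-in, not the real journal reader type):

import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;

public class RankStamper {
    // Simplified stand-in for a journal entry; illustrative only.
    record JournalEntry(int domainId, long docMeta) {}

    // Look up the domain's rank and OR it into bits 48-55 of the metadata.
    // This relies on the journal writer leaving the rank bits zero, since
    // encodeRank only ORs bits in.
    static long stampRank(JournalEntry entry, DomainRankings rankings) {
        int ranking = rankings.getRanking(entry.domainId());
        return EdgePageDocumentsMetadata.encodeRank(entry.docMeta(), ranking);
    }
}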

View File

@ -33,6 +33,11 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
if (!validateSize(post)) {
return false;
}
if (!validateRank(post)) {
return false;
}
return true;
}
@ -51,6 +56,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
return limit.test(quality);
}
private boolean validateYear(ForwardIndexReader.DocPost post) {
if (params.year().type() == SpecificationLimitType.NONE)
return true;
@ -69,6 +75,15 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
return params.size().test(postVal);
}
private boolean validateRank(ForwardIndexReader.DocPost post) {
if (params.rank().type() == SpecificationLimitType.NONE)
return true;
int postVal = EdgePageDocumentsMetadata.decodeRank(post.meta());
return params.rank().test(postVal);
}
@Override
public double cost() {
return 32;
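A self-contained sketch of the new rank check. Only SpecificationLimit.none() and test() appear verbatim in this commit; the comparison variants and their inclusivity below are assumptions read off the rank[=><]\d+ parser pattern later in the diff, not the real API:

public class RankFilterSketch {
    // Stripped-down stand-in for SpecificationLimit, for illustration only.
    record Limit(char op, int value) {
        static Limit none() { return new Limit('n', 0); }

        boolean test(int v) {
            return switch (op) {
                case 'n' -> true;       // no limit set: everything passes
                case '=' -> v == value;
                case '>' -> v >= value; // assumed inclusive
                case '<' -> v <= value; // assumed inclusive
                default  -> true;
            };
        }
    }

    // Mirrors validateRank above: pull the 8-bit rank out of the document
    // metadata word and apply the user's limit to it.
    static boolean validateRank(long docMeta, Limit rankLimit) {
        int rank = (int) ((docMeta >>> 48) & 0xFF);
        return rankLimit.test(rank);
    }

    public static void main(String[] args) {
        long meta = 25L << 48;  // a document whose domain rank is 25
        System.out.println(validateRank(meta, Limit.none()));       // true
        System.out.println(validateRank(meta, new Limit('<', 50))); // true: 25 <= 50
        System.out.println(validateRank(meta, new Limit('>', 50))); // false
    }
}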

View File

@ -53,6 +53,11 @@ public class ReverseIndexReader {
}
public EntrySource documents(int wordId, ReverseIndexEntrySourceBehavior behavior) {
if (null == words) {
logger.warn("Reverse index is not ready, dropping query");
return new EmptyEntrySource();
}
if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource();
long offset = words.get(wordId);

View File

@ -7,6 +7,7 @@ import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit;
public record IndexQueryParams(SpecificationLimit qualityLimit,
SpecificationLimit year,
SpecificationLimit size,
SpecificationLimit rank,
SearchSet searchSet,
QueryStrategy queryStrategy
)

View File

@ -1,21 +1,19 @@
package nu.marginalia.util.ranking;
package nu.marginalia.wmsa.edge.index.ranking;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntComparator;
import org.roaringbitmap.RoaringBitmap;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Set;
import java.util.function.IntToDoubleFunction;
import java.util.stream.IntStream;
import java.util.function.Supplier;
import static java.lang.Math.min;
public abstract class RankingAlgorithm {
protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
@ -133,29 +131,7 @@ public abstract class RankingAlgorithm {
return domainsById.size();
}
public RankVector pageRankVector() {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
for (int i = 0; i < iter_max; i++) {
RankVector newRank = createNewRankVector(rank);
double oldNorm = rank.norm();
double newNorm = newRank.norm();
double dNorm = oldNorm - newNorm ;
if (i < iter_max-1) {
adjustRankVector(newRank, dNorm, oldNorm);
}
rank = newRank;
}
return rank;
}
public RoaringBitmap pageRank(int resultCount) {
public <T> T pageRank(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
@ -174,10 +150,10 @@ public abstract class RankingAlgorithm {
}
return rank.getRanking(resultCount);
return rank.getRanking(resultCount, accumulatorP).get();
}
public RoaringBitmap pageRankWithPeripheralNodes(int resultCount) {
public <T> T pageRankWithPeripheralNodes(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
@ -201,32 +177,11 @@ public abstract class RankingAlgorithm {
logger.info("PRWPN iteration done");
return rank.getRanking(resultCount);
return rank.getRanking(resultCount, accumulatorP).get();
}
abstract void adjustRankVector(RankVector vector, double dNorm, double oldNorm);
public TIntList pageRank(IntToDoubleFunction weight, int resultCount) {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
for (int i = 0; i < iter_max; i++) {
RankVector newRank = createNewRankVector(rank);
double oldNorm = rank.norm();
double newNorm = newRank.norm();
double dNorm = oldNorm - newNorm ;
if (i < iter_max-1) {
adjustRankVector(newRank, dNorm, oldNorm);
}
rank = newRank;
}
return rank.getRanking(weight, resultCount);
}
abstract RankVector createNewRankVector(RankVector rank);
public boolean includeInRanking(RankingDomainData data) {
@ -271,9 +226,8 @@ public abstract class RankingAlgorithm {
public double norm() {
double v = 0.;
for (int i = 0; i < rank.length; i++) {
if (rank[i] > 0) { v+=rank[i]; }
else { v -= rank[i]; }
for (double value : rank) {
v += Math.abs(value);
}
return v;
}
@ -281,73 +235,38 @@ public abstract class RankingAlgorithm {
public double norm(RankVector other) {
double v = 0.;
for (int i = 0; i < rank.length; i++) {
double dv = rank[i] - other.get(i);
if (dv > 0) { v+=dv; }
else { v -= dv; }
v += Math.abs(rank[i] - other.get(i));
}
return v;
}
public TIntList getRanking(IntToDoubleFunction other, int numResults) {
TIntArrayList list = new TIntArrayList(numResults);
public <T> RankingResultAccumulator<T> getRanking(int numResults, Supplier<RankingResultAccumulator<T>> accumulatorP) {
Comparator<Integer> comparator = Comparator.comparing(i -> Math.sqrt(other.applyAsDouble(domainIdToIndex.get(i)) * rank[i]));
IntStream.range(0, rank.length)
.boxed()
.sorted(comparator.reversed())
.map(domainIndexToId::get)
.limit(numResults)
.forEach(list::add);
return list;
}
public RoaringBitmap getRanking(int numResults) {
if (numResults < 0) {
numResults = domainIdToIndex.size();
}
if (numResults >= rank.length) {
numResults = rank.length;
}
numResults = min(numResults, min(domainIdToIndex.size(), rank.length));
RoaringBitmap list = new RoaringBitmap();
int[] nodes = sortOrder(rank);
var accumulator = accumulatorP.get();
int[] nodes = new int[rank.length];
Arrays.setAll(nodes, i->i);
IntComparator comp = (i,j) -> (int) Math.signum(rank[j] - rank[i]);
IntArrays.quickSort(nodes, comp);
int i;
for (i = 0; i < numResults; i++) {
for (int i = 0; i < numResults; i++) {
int id = domainIndexToId.get(nodes[i]);
if (includeInRanking(domainsById.get(id)))
list.add(id);
accumulator.add(id, i);
}
for (; i < nodes.length && domainsById.size() < numResults; i++) {
int id = domainIndexToId.get(nodes[i]);
if (includeInRanking(domainsById.get(id)))
list.add(id);
}
return list;
return accumulator;
}
private static int[] sortOrder(double[] values) {
public void incrementAll(double v) {
for (int i = 0; i < rank.length; i++) {
rank[i]+=v;
}
}
int[] ret = new int[values.length];
Arrays.setAll(ret, i->i);
IntArrays.quickSort(ret, (i,j) -> (int) Math.signum(values[j] - values[i]));
int size() {
return domainsById.size();
return ret;
}
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.util.ranking;
package nu.marginalia.wmsa.edge.index.ranking;
import lombok.AllArgsConstructor;
import lombok.Data;

View File

@ -1,4 +1,4 @@
package nu.marginalia.util.ranking;
package nu.marginalia.wmsa.edge.index.ranking;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;

View File

@ -1,10 +1,10 @@
package nu.marginalia.util.ranking;
package nu.marginalia.wmsa.edge.index.ranking;
public class BetterReversePageRank extends RankingAlgorithm {
public class ReversePageRank extends RankingAlgorithm {
public BetterReversePageRank(RankingDomainFetcher domains, String... origins) {
public ReversePageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@ -20,8 +20,6 @@ public class BetterReversePageRank extends RankingAlgorithm {
double newRankValue = 0;
if (links != null && links.size() > 0) {
for (int j = 0; j < links.size(); j++) {
var revLinks = linkDataDest2Src[links.getQuick(j)];
newRankValue += rank.get(links.getQuick(j)) / revLinks.size();

View File

@ -1,9 +1,9 @@
package nu.marginalia.util.ranking;
package nu.marginalia.wmsa.edge.index.ranking;
public class BetterStandardPageRank extends RankingAlgorithm {
public class StandardPageRank extends RankingAlgorithm {
public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) {
public StandardPageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@ -38,8 +38,7 @@ public class BetterStandardPageRank extends RankingAlgorithm {
@Override
void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() /* dNorm/originDomainIds.size() */ ));
// vector.incrementAll(0.14*dNorm/vector.size());
originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() ));
}
}

View File

@ -0,0 +1,6 @@
package nu.marginalia.wmsa.edge.index.ranking.accumulator;
public interface RankingResultAccumulator<T> {
void add(int domainId, int rank);
T get();
}
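This interface decouples the PageRank loop from how results are collected; the three implementations added below cover the commit's call sites (a bitmap for search sets, an id-to-rank map for DomainRankings, a plain list for the upload tool). Any other sink plugs in the same way; for instance, a hypothetical accumulator that merely counts the results:

import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultAccumulator;

// Hypothetical, not part of this commit: counts how many domains were emitted.
public class RankingResultCountingAccumulator implements RankingResultAccumulator<Integer> {
    private int count = 0;

    @Override
    public void add(int domainId, int rank) {
        count++; // ignore the values, only tally the emissions
    }

    @Override
    public Integer get() {
        return count;
    }
}

// Usage, matching the Supplier-based call sites introduced in this commit:
// int n = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultCountingAccumulator::new);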

View File

@ -0,0 +1,17 @@
package nu.marginalia.wmsa.edge.index.ranking.accumulator;
import org.roaringbitmap.RoaringBitmap;
public class RankingResultBitSetAccumulator implements RankingResultAccumulator<RoaringBitmap> {
private final RoaringBitmap result = new RoaringBitmap();
@Override
public void add(int domainId, int rank) {
result.add(domainId);
}
@Override
public RoaringBitmap get() {
return result;
}
}

View File

@ -0,0 +1,21 @@
package nu.marginalia.wmsa.edge.index.ranking.accumulator;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
public class RankingResultHashMapAccumulator implements RankingResultAccumulator<Int2IntOpenHashMap> {
private final Int2IntOpenHashMap result;
public RankingResultHashMapAccumulator(int size) {
result = new Int2IntOpenHashMap(size);
}
@Override
public void add(int domainId, int rank) {
result.put(domainId, rank);
}
@Override
public Int2IntOpenHashMap get() {
return result;
}
}

View File

@ -0,0 +1,24 @@
package nu.marginalia.wmsa.edge.index.ranking.accumulator;
import gnu.trove.list.array.TIntArrayList;
public class RankingResultListAccumulator implements RankingResultAccumulator<TIntArrayList> {
private final TIntArrayList result;
public RankingResultListAccumulator(int size) {
result = new TIntArrayList(size);
}
public RankingResultListAccumulator() {
result = new TIntArrayList(10_000);
}
@Override
public void add(int domainId, int rank) {
result.add(domainId);
}
@Override
public TIntArrayList get() {
return result;
}
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.util.ranking.old;
package nu.marginalia.wmsa.edge.index.ranking.old;
import com.zaxxer.hikari.HikariDataSource;

View File

@ -1,4 +1,4 @@
package nu.marginalia.util.ranking.old;
package nu.marginalia.wmsa.edge.index.ranking.old;
import com.zaxxer.hikari.HikariDataSource;
@ -125,7 +125,6 @@ public class StandardPageRank {
final TIntArrayList empty = new TIntArrayList();
double rankNorm = rank.norm();
RankVector newRank = new RankVector(0);
for (DomainData domain : domains.valueCollection()) {
@ -176,8 +175,6 @@ public class StandardPageRank {
}
});
}
TIntHashSet deadEnds = new TIntHashSet(domains.size());
}
private class RankVector {

View File

@ -1,4 +1,4 @@
package nu.marginalia.util.ranking.tool;
package nu.marginalia.wmsa.edge.index.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
@ -10,9 +10,9 @@ import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntComparator;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.RankingAlgorithm;
import nu.marginalia.util.ranking.RankingDomainData;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.RankingAlgorithm;
import nu.marginalia.wmsa.edge.index.ranking.RankingDomainData;
import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import org.jetbrains.annotations.NotNull;
@ -33,8 +33,6 @@ public class PerusePageRankV2 {
TIntArrayList[] linkDataSrc2Dest;
TIntArrayList[] linkDataDest2Src;
private static final boolean getNames = true;
private final Logger logger = LoggerFactory.getLogger(getClass());
static final LinkedBlockingQueue<LinkAdjacencies> uploadQueue = new LinkedBlockingQueue<>(10);

View File

@ -1,9 +1,10 @@
package nu.marginalia.util.ranking.tool;
package nu.marginalia.wmsa.edge.index.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import org.mariadb.jdbc.Driver;
@ -17,8 +18,6 @@ public class UpdateDomainRanksTool2 {
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class);
public final long domainIdMax = -1;
public int domainCount;
private volatile static int rankMax;
static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
@ -35,20 +34,21 @@ public class UpdateDomainRanksTool2 {
logger.info("Ranking");
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
var rpr = new ReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
var rankVector = rpr.pageRankVector();
rankMax = rpr.size();
uploader.start();
var rankData = rpr.pageRankWithPeripheralNodes(rankMax);
for (int i : rankData) {
var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new);
rankData.forEach(i -> {
try {
uploadQueue.put(i);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
return true;
});
long end = System.currentTimeMillis();
running = false;

View File

@ -129,6 +129,7 @@ public class EdgeIndexQueryService {
specsSet.quality,
specsSet.year,
specsSet.size,
specsSet.rank,
getSearchSet(specsSet),
specsSet.queryStrategy);
}

View File

@ -2,51 +2,43 @@ package nu.marginalia.wmsa.edge.index.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.util.ranking.BetterStandardPageRank;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet;
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet;
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny;
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
import org.roaringbitmap.RoaringBitmap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
@Singleton
public class EdgeIndexSearchSetsService {
private final HikariDataSource dataSource;
private RankingDomainFetcher rankingDomains;
private final RankingDomainFetcher rankingDomains;
private final RankingSettings rankingSettings;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final SearchSet anySet = new SearchSetAny();
private volatile RankingSearchSet retroSet;
private volatile RankingSearchSet smallWebSet;
private volatile RankingSearchSet academiaSet;
private volatile DomainRankings domainRankings = new DomainRankings();
@Inject
public EdgeIndexSearchSetsService(HikariDataSource dataSource,
RankingDomainFetcher rankingDomains,
public EdgeIndexSearchSetsService(RankingDomainFetcher rankingDomains,
RankingSettings rankingSettings,
IndexServicesFactory servicesFactory) throws IOException {
this.dataSource = dataSource;
this.rankingDomains = rankingDomains;
this.rankingSettings = rankingSettings;
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat"));
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat"));
logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
}
public void recalculateAll() {
@ -55,52 +47,27 @@ public class EdgeIndexSearchSetsService {
updateSmallWebDomains();
}
@SneakyThrows
public RoaringBitmap goodUrls() {
RoaringBitmap domains = new RoaringBitmap();
RoaringBitmap urls = new RoaringBitmap();
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) {
stmt.setFetchSize(10_000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
domains.add(rsp.getInt(1));
}
}
// For some reason, doing this "INNER JOIN" in Java is significantly faster than doing it in SQL
try (var stmt = connection.prepareStatement("SELECT ID,DOMAIN_ID FROM EC_URL WHERE VISITED AND EC_URL.STATE='OK'")) {
stmt.setFetchSize(10_000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
if (domains.contains(rsp.getInt(2))) {
urls.add(rsp.getInt(1));
}
}
}
}
return urls;
}
@SneakyThrows
public void updateRetroDomains() {
var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
var data = spr.pageRankWithPeripheralNodes(spr.size() / 2);
var spr = new StandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new);
synchronized (this) {
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
retroSet.write();
}
var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2, () -> new RankingResultHashMapAccumulator(100_000));
synchronized (this) {
domainRankings = new DomainRankings(ranks);
}
}
@SneakyThrows
public void updateSmallWebDomains() {
var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
var rpr = new ReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
rpr.setMaxKnownUrls(750);
var data = rpr.pageRankWithPeripheralNodes(rpr.size());
var data = rpr.pageRankWithPeripheralNodes(rpr.size(), RankingResultBitSetAccumulator::new);
synchronized (this) {
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
@ -110,8 +77,8 @@ public class EdgeIndexSearchSetsService {
@SneakyThrows
public void updateAcademiaDomains() {
var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
var data = spr.pageRankWithPeripheralNodes(spr.size()/2);
var spr = new StandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
var data = spr.pageRankWithPeripheralNodes(spr.size()/2, RankingResultBitSetAccumulator::new);
synchronized (this) {
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
@ -119,41 +86,8 @@ public class EdgeIndexSearchSetsService {
}
}
@SneakyThrows
public TIntList getStandardDomains() {
TIntArrayList results = new TIntArrayList();
try (var connection = dataSource.getConnection();
var stmt = connection.prepareStatement(
"""
SELECT ID FROM EC_DOMAIN
WHERE INDEXED>0
AND STATE='ACTIVE'
AND DOMAIN_ALIAS IS NULL
ORDER BY ID ASC
""");
) {
var rs = stmt.executeQuery();
while (rs.next()) {
results.add(rs.getInt(1));
}
}
return results;
}
@SneakyThrows
public TIntList getSpecialDomains() {
TIntArrayList results = new TIntArrayList();
try (var connection = dataSource.getConnection();
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'")
) {
var rs = stmt.executeQuery();
while (rs.next()) {
results.add(rs.getInt(1));
}
}
return results;
public DomainRankings getDomainRankings() {
return domainRankings;
}
public SearchSet getSearchSetByName(SearchSetIdentifier searchSetIdentifier) {

View File

@ -24,6 +24,11 @@ public record EdgeSearchResultKeywordScore(int set,
sum += 20;
}
int rank = EdgePageDocumentsMetadata.decodeRank(encodedDocMetadata) - 13;
if (rank < 0)
sum += rank / 2;
else
sum += rank / 4;
return sum;
}
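Worked numbers for the new term, assuming (as the sign of the bonus suggests, though this hunk does not show it) that a lower sum means a better result:

public class RankTermSketch {
    // Same arithmetic as above: ranks below 13 earn a bonus at half weight,
    // ranks above 13 a penalty at a quarter weight.
    static int rankAdjustment(int encodedRank) {
        int rank = encodedRank - 13;
        return rank < 0 ? rank / 2 : rank / 4;
    }

    public static void main(String[] args) {
        System.out.println(rankAdjustment(1));   // -6: bonus for a top-ranked domain
        System.out.println(rankAdjustment(13));  //  0: the neutral point
        System.out.println(rankAdjustment(201)); // 47: penalty deep in the long tail
    }
}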

View File

@ -19,6 +19,7 @@ public class EdgeSearchSpecification {
public final SpecificationLimit quality;
public final SpecificationLimit year;
public final SpecificationLimit size;
public final SpecificationLimit rank;
public final QueryLimits queryLimits;
public final QueryStrategy queryStrategy;

View File

@ -97,6 +97,7 @@ public class QueryFactory {
SpecificationLimit qualityLimit = profile.getQualityLimit();
SpecificationLimit year = profile.getYearLimit();
SpecificationLimit size = profile.getSizeLimit();
SpecificationLimit rank = SpecificationLimit.none();
for (Token t : basicQuery) {
if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) {
@ -116,6 +117,9 @@ public class QueryFactory {
if (t.type == TokenType.SIZE_TERM) {
size = parseSpecificationLimit(t.str);
}
if (t.type == TokenType.RANK_TERM) {
rank = parseSpecificationLimit(t.str);
}
if (t.type == TokenType.QS_TERM) {
queryStrategy = parseQueryStrategy(t.str);
}
@ -154,6 +158,8 @@ public class QueryFactory {
case QUALITY_TERM:
case YEAR_TERM:
case SIZE_TERM:
case RANK_TERM:
case QS_TERM:
break; //
case NEAR_TERM:
near = t.str;
@ -199,6 +205,7 @@ public class QueryFactory {
.quality(qualityLimit)
.year(year)
.size(size)
.rank(rank)
.domains(domains)
.queryStrategy(queryStrategy)
.searchSetIdentifier(profile.searchSetIdentifier);

View File

@ -93,6 +93,8 @@ public class QueryParser {
entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr));
} else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) {
entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr));
} else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) {
entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr));
} else if (t.str.startsWith("qs=")) {
entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr));
} else if (t.str.contains(":")) {
@ -508,6 +510,7 @@ enum TokenType implements Predicate<Token> {
QUALITY_TERM,
YEAR_TERM,
SIZE_TERM,
RANK_TERM,
NEAR_TERM,
QS_TERM,

View File

@ -56,6 +56,9 @@
<tr><td>year=2005</td><td>(beta) The document was ostensibly published in 2005</td></tr>
<tr><td>year&lt;2005</td><td>(beta) The document was ostensibly published in or before 2005</td></tr>
<tr><td>rank&gt;50</td><td>(beta) The ranking of the website is at least 50 on a scale of 1 - 255</td></tr>
<tr><td>rank&lt;50</td><td>(beta) The ranking of the website is at most 50 on a scale of 1 - 255</td></tr>
<tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
<tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>
<tr><td>format:html123</td><td>Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites. </td></tr>
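Together with the parser change above, the new filter composes with ordinary search terms. Illustrative queries (not taken from the commit), recalling that 1 is the best-ranked domain:

    linux rank<25       pages from domains ranked 25 or better
    recipes rank>100    pages from domains ranked 100 or worse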

View File

@ -3,13 +3,15 @@ package nu.marginalia.wmsa.edge.index.model;
import org.junit.jupiter.api.Test;
import java.util.EnumSet;
import static org.junit.jupiter.api.Assertions.assertEquals;
class EdgePageDocumentsMetadataTest {
@Test
public void codecYear() {
var meta = new EdgePageDocumentsMetadata(0, 0, 192, 0, 0, (byte) 0);
var meta = new EdgePageDocumentsMetadata(0, 0, 0, 192, 0, 0, (byte) 0);
long encoded = meta.encode();
var decoded = new EdgePageDocumentsMetadata(encoded);
assertEquals(192, decoded.year());
@ -17,7 +19,7 @@ class EdgePageDocumentsMetadataTest {
@Test
public void codecTopology() {
var meta = new EdgePageDocumentsMetadata(0, 192, 0, 0, 0, (byte) 0);
var meta = new EdgePageDocumentsMetadata(0, 0, 192, 0, 0, 0, (byte) 0);
long encoded = meta.encode();
var decoded = new EdgePageDocumentsMetadata(encoded);
assertEquals(192, decoded.topology());
@ -25,7 +27,7 @@ class EdgePageDocumentsMetadataTest {
@Test
public void codecSets() {
var meta = new EdgePageDocumentsMetadata(0, 0, 0, 14, 0, (byte) 0);
var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 14, 0, (byte) 0);
long encoded = meta.encode();
var decoded = new EdgePageDocumentsMetadata(encoded);
assertEquals(14, decoded.sets());
@ -33,7 +35,7 @@ class EdgePageDocumentsMetadataTest {
@Test
public void codecQuality() {
var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 9, (byte) 0);
var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, 9, (byte) 0);
long encoded = meta.encode();
var decoded = new EdgePageDocumentsMetadata(encoded);
assertEquals(9, decoded.quality());
@ -41,7 +43,7 @@ class EdgePageDocumentsMetadataTest {
@Test
public void codecFlags() {
var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, (byte) 255);
var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, 0, (byte) 255);
long encoded = meta.encode();
System.out.println(Long.toHexString(encoded));
var decoded = new EdgePageDocumentsMetadata(encoded);
@ -57,7 +59,17 @@ class EdgePageDocumentsMetadataTest {
assertEquals(50, new EdgePageDocumentsMetadata(0).withSize(4).size());
assertEquals(50, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(4).encode()));
assertEquals(50*255, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).encode()));
assertEquals(50*255, new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).size());
assertEquals(50 * 255, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).encode()));
assertEquals(50 * 255, new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).size());
}
@Test
public void encRank() {
var meta = new EdgePageDocumentsMetadata(5, 22, 3, 8, EnumSet.noneOf(EdgePageDocumentFlags.class))
.withSize(0xffffffff).encode();
var enc2 = EdgePageDocumentsMetadata.encodeRank(meta, 83);
assertEquals(83, EdgePageDocumentsMetadata.decodeRank(enc2));
assertEquals(5, EdgePageDocumentsMetadata.decodeTopology(enc2));
}
}

View File

@ -6,6 +6,7 @@ import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.util.test.TestUtil;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile;
@ -36,7 +37,6 @@ class ForwardIndexConverterTest {
private final Logger logger = LoggerFactory.getLogger(getClass());
Path dataDir;
private Path wordsFile;
private Path docsFileId;
private Path docsFileData;
@ -71,7 +71,6 @@ class ForwardIndexConverterTest {
var reader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile));
wordsFile = dataDir.resolve("words.dat");
docsFileId = dataDir.resolve("docs-i.dat");
docsFileData = dataDir.resolve("docs-d.dat");
}
@ -104,18 +103,15 @@ class ForwardIndexConverterTest {
@Test
void testForwardIndex() throws IOException {
Path tmpDir = Path.of("/tmp");
new ForwardIndexConverter(tmpDir, indexFile.toFile(), docsFileId, docsFileData).convert();
new ForwardIndexConverter(indexFile.toFile(), docsFileId, docsFileData, new DomainRankings()).convert();
var forwardReader = new ForwardIndexReader(docsFileId, docsFileData);
for (int i = 36; i < workSetSize; i++) {
assertEquals(i % 5, forwardReader.getDocMeta(i));
assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(i));
assertEquals(i/20, forwardReader.getDomainId(i));
}
TestUtil.clearTempDir(dataDir);
}

View File

@ -82,6 +82,7 @@ public class EdgeIndexIntegrationTest {
.year(SpecificationLimit.none())
.quality(SpecificationLimit.none())
.size(SpecificationLimit.none())
.rank(SpecificationLimit.none())
.domains(new ArrayList<>())
.searchSetIdentifier(SearchSetIdentifier.NONE)
.subqueries(List.of(new EdgeSearchSubquery(
@ -113,6 +114,7 @@ public class EdgeIndexIntegrationTest {
.year(SpecificationLimit.none())
.quality(SpecificationLimit.none())
.size(SpecificationLimit.none())
.rank(SpecificationLimit.none())
.queryStrategy(QueryStrategy.SENTENCE)
.domains(List.of(2))
.subqueries(List.of(new EdgeSearchSubquery(
@ -139,6 +141,7 @@ public class EdgeIndexIntegrationTest {
.quality(SpecificationLimit.none())
.year(SpecificationLimit.equals(1998))
.size(SpecificationLimit.none())
.rank(SpecificationLimit.none())
.queryStrategy(QueryStrategy.SENTENCE)
.searchSetIdentifier(SearchSetIdentifier.NONE)
.subqueries(List.of(new EdgeSearchSubquery(
@ -161,7 +164,7 @@ public class EdgeIndexIntegrationTest {
long fullId = id | ((long) (32 - (id % 32)) << 32);
var header = new SearchIndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, id % 5, id, id % 20, (byte) 0).encode());
var header = new SearchIndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
long[] data = new long[factors.length*2];
for (int i = 0; i < factors.length; i++) {

View File

@ -4,6 +4,7 @@ import com.google.inject.AbstractModule;
import com.google.inject.name.Names;
import nu.marginalia.util.test.TestUtil;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny;
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
@ -42,11 +43,12 @@ public class EdgeIndexIntegrationTestModule extends AbstractModule {
System.setProperty("small-ram", "true");
try {
bind(IndexServicesFactory.class).toInstance(new IndexServicesFactory(Path.of("/tmp"),
slowDir, fastDir, null
slowDir, fastDir
));
EdgeIndexSearchSetsService setsServiceMock = Mockito.mock(EdgeIndexSearchSetsService.class);
when(setsServiceMock.getSearchSetByName(SearchSetIdentifier.NONE)).thenReturn(new SearchSetAny());
when(setsServiceMock.getDomainRankings()).thenReturn(new DomainRankings());
bind(EdgeIndexSearchSetsService.class).toInstance(setsServiceMock);