mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00

Reintroduce the ability to filter search results by their ranking.

parent 4a07eda61c
commit 04f905f3a1
BuggyReversePageRank.java (deleted)
@@ -1,39 +0,0 @@
-package nu.marginalia.util.ranking;
-
-public class BuggyReversePageRank extends RankingAlgorithm {
-
-    public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) {
-        super(domains, origins);
-    }
-
-    @Override
-    RankVector createNewRankVector(RankVector rank) {
-
-        double rankNorm = rank.norm();
-        RankVector newRank = new RankVector(0);
-
-        for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
-
-            var links = linkDataSrc2Dest[domainId];
-
-            if (links != null && links.size() > 0) {
-                double newRankValue = 0;
-
-                for (int j = 0; j < links.size(); j++) {
-                    newRankValue += rank.get(links.getQuick(j)) / links.size();
-                }
-
-                newRank.set(domainId, 0.85*newRankValue/rankNorm);
-            }
-        }
-        return newRank;
-    }
-
-    @Override
-    void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
-        originDomainIds.forEach(id -> vector.increment(domainIdToIndex.get(id), dNorm/oldNorm));
-    }
-
-}
BuggyStandardPageRank.java (deleted)
@@ -1,45 +0,0 @@
-package nu.marginalia.util.ranking;
-
-public class BuggyStandardPageRank extends RankingAlgorithm {
-
-    public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) {
-        super(domains, origins);
-    }
-
-    @Override
-    RankingAlgorithm.RankVector createNewRankVector(RankingAlgorithm.RankVector rank) {
-        RankVector newRank = new RankVector(0);
-
-        for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
-
-            var links = linkDataSrc2Dest[domainId];
-            double newRankValue = 0;
-
-            if (links != null && links.size() > 0) {
-                for (int j = 0; j < links.size(); j++) {
-                    int linkedDomain = links.getQuick(j);
-
-                    int linkSize = 1;
-                    var bl = linkDataSrc2Dest[linkedDomain];
-                    if (bl != null) {
-                        linkSize = bl.size();
-                    }
-
-                    newRankValue += rank.get(linkedDomain) / linkSize;
-
-                }
-            }
-
-            newRank.set(domainId, 0.85 * newRankValue);
-        }
-        return newRank;
-    }
-
-    @Override
-    void adjustRankVector(RankingAlgorithm.RankVector vector, double dNorm, double oldNorm) {
-        originDomainIds.forEach(id -> vector.increment(id, dNorm/originDomainIds.size()));
-        vector.incrementAll(0.14*dNorm/vector.size());
-    }
-
-}
DedupTool.java (deleted)
@@ -1,89 +0,0 @@
-package nu.marginalia.util.ranking.tool;
-
-import lombok.AllArgsConstructor;
-import lombok.Getter;
-import lombok.SneakyThrows;
-import lombok.ToString;
-import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import org.mariadb.jdbc.Driver;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.util.*;
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
-
-public class DedupTool {
-
-    private static final Logger logger = LoggerFactory.getLogger(DedupTool.class);
-
-    public Set<String> originDomains = new HashSet<>();
-    public Set<Integer> originDomainIds = new HashSet<>();
-    public final long domainIdMax = -1;
-    public int domainCount;
-    private volatile static int rankMax;
-
-    public int maxId() {
-        return (int) domainIdMax;
-    }
-    public int domainCount() {
-        return domainCount;
-    }
-
-    static LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
-    volatile static boolean running = true;
-
-    @AllArgsConstructor @ToString @Getter
-    static class Data {
-        String url;
-        int id;
-        String domain;
-    }
-
-    @SneakyThrows
-    public static void main(String... args) {
-        Driver driver = new Driver();
-        var ds = new DatabaseModule().provideConnection();
-
-        Map<Integer, Map<Integer, List<Data>>> domainToHashToUrl = new HashMap<>();
-
-        try (var conn = ds.getConnection();
-             var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
-             var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?")
-        ) {
-            fetchStmt.setFetchSize(10_000);
-            var rsp = fetchStmt.executeQuery();
-            while (rsp.next()) {
-                domainToHashToUrl.computeIfAbsent(rsp.getInt(1), i -> new HashMap<>())
-                        .computeIfAbsent(rsp.getInt(2), i -> new ArrayList<>()).add(new Data(rsp.getString(3), rsp.getInt(4), rsp.getString(5)));
-            }
-
-            List<Integer> updateIds = new ArrayList<>();
-
-            domainToHashToUrl.forEach((domain, hashes) -> {
-                hashes.forEach((hash, urls) -> {
-                    if (urls.size() > 1) {
-                        Comparator<Data> c = Comparator.comparing(d -> d.domain.length());
-                        var urls2 = urls.stream().sorted(c.thenComparing(d -> d.url.length()))
-                                .collect(Collectors.partitioningBy(d -> d.url.endsWith("/")));
-
-                        Stream
-                                .concat(urls2.get(true).stream(),urls2.get(false).stream()).skip(1)
-                                .map(Data::getId)
-                                .forEach(updateIds::add);
-                    }
-                });
-            });
-
-            for (int id : updateIds) {
-                updateStmt.setInt(1, id);
-                updateStmt.executeUpdate();
-            }
-        }
-    }
-
-}
UpdateDomainRanksTool.java (deleted)
@@ -1,93 +0,0 @@
-package nu.marginalia.util.ranking.tool;
-
-import com.zaxxer.hikari.HikariDataSource;
-import lombok.SneakyThrows;
-import nu.marginalia.util.ranking.BuggyStandardPageRank;
-import nu.marginalia.util.ranking.RankingDomainFetcher;
-import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
-import org.mariadb.jdbc.Driver;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.sql.SQLException;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.concurrent.LinkedBlockingQueue;
-
-public class UpdateDomainRanksTool {
-
-    private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class);
-
-    public Set<String> originDomains = new HashSet<>();
-    public Set<Integer> originDomainIds = new HashSet<>();
-    public final long domainIdMax = -1;
-    public int domainCount;
-    private volatile static int rankMax;
-
-    public int maxId() {
-        return (int) domainIdMax;
-    }
-    public int domainCount() {
-        return domainCount;
-    }
-
-    static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
-    volatile static boolean running = true;
-
-    @SneakyThrows
-    public static void main(String... args) {
-        org.mariadb.jdbc.Driver driver = new Driver();
-        var conn = new DatabaseModule().provideConnection();
-
-        long start = System.currentTimeMillis();
-        var uploader = new Thread(() -> uploadThread(conn), "Uploader");
-
-        logger.info("Ranking");
-        var ds = new DatabaseModule().provideConnection();
-        var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
-        var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
-
-        rankMax = spr.size()*2;
-        uploader.start();
-
-        var rankData = spr.pageRankWithPeripheralNodes(rankMax);
-        for (int i : rankData) {
-            try {
-                uploadQueue.put(i);
-            } catch (InterruptedException e) {
-                e.printStackTrace();
-            }
-        }
-
-        long end = System.currentTimeMillis();
-        running = false;
-        uploader.join();
-
-        logger.info("Done in {}", (end - start)/1000.0);
-    }
-
-    public static void uploadThread(HikariDataSource dataSource) {
-        int i = 0;
-
-        try (var conn = dataSource.getConnection()) {
-            logger.info("Resetting rank");
-            try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=1")) {
-                stmt.executeUpdate();
-            }
-
-            logger.info("Updating ranks");
-            try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=? WHERE ID=?")) {
-                while (running || (!running && !uploadQueue.isEmpty())) {
-                    var job = uploadQueue.take();
-                    stmt.setDouble(1, i++ / (double) rankMax);
-                    stmt.setInt(2, job);
-                    stmt.executeUpdate();
-                }
-            }
-
-        } catch (SQLException | InterruptedException throwables) {
-            throwables.printStackTrace();
-        }
-    }
-}
EdgeIndexControl.java
@@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.index;
 
 import com.google.inject.Inject;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
 
 import java.io.IOException;
 
@@ -9,14 +10,16 @@ import java.io.IOException;
 public class EdgeIndexControl {
 
     private final IndexServicesFactory servicesFactory;
+    private final EdgeIndexSearchSetsService searchSetsService;
 
     @Inject
-    public EdgeIndexControl(IndexServicesFactory servicesFactory) {
+    public EdgeIndexControl(IndexServicesFactory servicesFactory, EdgeIndexSearchSetsService searchSetsService) {
         this.servicesFactory = servicesFactory;
+        this.searchSetsService = searchSetsService;
     }
 
     public void regenerateIndex() throws IOException {
-        servicesFactory.convertIndex();
+        servicesFactory.convertIndex(searchSetsService.getDomainRankings());
 
         System.gc();
     }
IndexServicesFactory.java
@@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist;
 import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
 import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
 import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
+import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
 import nu.marginalia.wmsa.edge.index.postings.SearchIndex;
 import nu.marginalia.wmsa.edge.index.postings.SearchIndexReader;
 import nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexConverter;
@@ -20,6 +21,7 @@ import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexConverter;
 import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPrioReader;
 import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPriorityParameters;
 import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexReader;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -33,7 +35,6 @@ import java.util.concurrent.Callable;
 @Singleton
 public class IndexServicesFactory {
     private final Path tmpFileDir;
-    private final EdgeDomainBlacklist domainBlacklist;
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
@@ -58,12 +59,10 @@ public class IndexServicesFactory {
     public IndexServicesFactory(
             @Named("tmp-file-dir") Path tmpFileDir,
             @Named("partition-root-slow") Path partitionRootSlow,
-            @Named("partition-root-fast") Path partitionRootFast,
-            EdgeDomainBlacklist domainBlacklist
+            @Named("partition-root-fast") Path partitionRootFast
             ) throws IOException {
 
         this.tmpFileDir = tmpFileDir;
-        this.domainBlacklist = domainBlacklist;
 
         this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat");
         this.keywordLexiconFile = new RootDataFile(partitionRootSlow, "dictionary.dat");
@@ -106,8 +105,8 @@ public class IndexServicesFactory {
 
     }
 
-    public void convertIndex() throws IOException {
-        convertForwardIndex();
+    public void convertIndex(DomainRankings domainRankings) throws IOException {
+        convertForwardIndex(domainRankings);
         convertFullReverseIndex();
         convertPriorityReverseIndex();
 
@@ -148,13 +147,14 @@ public class IndexServicesFactory {
         tryGc();
     }
 
-    private void convertForwardIndex() throws IOException {
+    private void convertForwardIndex(DomainRankings domainRankings) throws IOException {
         logger.info("Converting forward index data");
 
-        new ForwardIndexConverter(tmpFileDir,
+        new ForwardIndexConverter(
                 writerIndexFile.get(0),
                 fwdIndexDocId.get(NEXT_PART).toPath(),
-                fwdIndexDocData.get(NEXT_PART).toPath())
+                fwdIndexDocData.get(NEXT_PART).toPath(),
+                domainRankings)
                 .convert();
 
         tryGc();
@@ -212,8 +212,8 @@ public class IndexServicesFactory {
         }
     }
 
-    public SearchIndex createIndexBucket() {
-        return new SearchIndex(this, new EdgeIndexControl(this));
+    public SearchIndex createIndexBucket(EdgeIndexSearchSetsService searchSetsService) {
+        return new SearchIndex(this, new EdgeIndexControl(this, searchSetsService));
    }
 
     public SearchIndexReader getSearchIndexReader() throws IOException {
EdgePageDocumentsMetadata.java
@@ -8,7 +8,8 @@ import java.util.Set;
 import static java.lang.Math.max;
 import static java.lang.Math.min;
 
-public record EdgePageDocumentsMetadata(int encSize,
+public record EdgePageDocumentsMetadata(int rank,
+                                        int encSize,
                                         int topology,
                                         int year,
                                         int sets,
@@ -16,9 +17,13 @@ public record EdgePageDocumentsMetadata(int encSize,
                                         byte flags) {
 
+    public static final long RANK_MASK = 0xFFL;
+    public static final int RANK_SHIFT = 48;
+
     public static final long ENCSIZE_MASK = 0xFFL;
-    public static final int ENCSIZE_SHIFT = 48;
+    public static final int ENCSIZE_SHIFT = 40;
     public static final int ENCSIZE_MULTIPLIER = 50;
 
     public static final long TOPOLOGY_MASK = 0xFFL;
     public static final int TOPOLOGY_SHIFT = 32;
@@ -39,7 +44,7 @@ public record EdgePageDocumentsMetadata(int encSize,
         this(defaultValue());
     }
     public EdgePageDocumentsMetadata(int topology, int year, int sets, int quality, EnumSet<EdgePageDocumentFlags> flags) {
-        this(0, topology, year, sets, quality, encodeFlags(flags));
+        this(0, 0, topology, year, sets, quality, encodeFlags(flags));
     }
 
     public EdgePageDocumentsMetadata withSize(int size) {
@@ -49,7 +54,7 @@ public record EdgePageDocumentsMetadata(int encSize,
 
         final int encSize = (int) Math.min(ENCSIZE_MASK, Math.max(1, size / ENCSIZE_MULTIPLIER));
 
-        return new EdgePageDocumentsMetadata(encSize, topology, year, sets, quality, flags);
+        return new EdgePageDocumentsMetadata(rank, encSize, topology, year, sets, quality, flags);
     }
 
     private static byte encodeFlags(Set<EdgePageDocumentFlags> flags) {
@@ -63,7 +68,8 @@ public record EdgePageDocumentsMetadata(int encSize,
     }
 
     public EdgePageDocumentsMetadata(long value) {
-        this( (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK),
+        this( (int) ((value >>> RANK_SHIFT) & RANK_MASK),
+              (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK),
              (int) ((value >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK),
              (int) ((value >>> YEAR_SHIFT) & YEAR_MASK),
              (int) ((value >>> SETS_SHIFT) & SETS_MASK),
@@ -84,12 +90,13 @@ public record EdgePageDocumentsMetadata(int encSize,
         ret |= min(YEAR_MASK, max(0, year)) << YEAR_SHIFT;
         ret |= min(TOPOLOGY_MASK, max(0, topology)) << TOPOLOGY_SHIFT;
         ret |= min(ENCSIZE_MASK, max(0, encSize)) << ENCSIZE_SHIFT;
+        ret |= min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
 
         return ret;
     }
 
     public boolean isEmpty() {
-        return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0;
+        return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0 && rank == 0;
     }
 
     public static int decodeQuality(long encoded) {
@@ -112,6 +119,12 @@ public record EdgePageDocumentsMetadata(int encSize,
         return ENCSIZE_MULTIPLIER * (int) ((encoded >>> ENCSIZE_SHIFT) & ENCSIZE_MASK);
     }
 
+    public static int decodeRank(long encoded) {
+        return (int) ((encoded >>> RANK_SHIFT) & RANK_MASK);
+    }
+
+    public static long encodeRank(long encoded, int rank) {
+        return encoded | min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
+    }
+
 }
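The metadata hunk above is the core of the commit: the 64-bit document metadata word now carries an 8-bit domain rank in bits 48-55 (RANK_SHIFT = 48, RANK_MASK = 0xFF), which is why ENCSIZE_SHIFT moves from 48 down to 40 to make room. As a reading aid, here is a standalone sketch (not part of the commit) that mirrors the encodeRank/decodeRank logic from the diff and shows the round-trip and the saturation at 255:

public class RankPackingDemo {
    static final long RANK_MASK = 0xFFL;
    static final int RANK_SHIFT = 48;

    // mirrors EdgePageDocumentsMetadata.encodeRank from the hunk above
    static long encodeRank(long encoded, int rank) {
        return encoded | Math.min(RANK_MASK, Math.max(0, rank)) << RANK_SHIFT;
    }

    // mirrors EdgePageDocumentsMetadata.decodeRank from the hunk above
    static int decodeRank(long encoded) {
        return (int) ((encoded >>> RANK_SHIFT) & RANK_MASK);
    }

    public static void main(String[] args) {
        long meta = 0x0000_0034_5678_9ABCL;                      // metadata word with an empty rank byte
        System.out.println(decodeRank(encodeRank(meta, 42)));    // 42
        System.out.println(decodeRank(encodeRank(meta, 1000)));  // 255: values saturate at RANK_MASK
    }
}

Since encodeRank ORs the rank into the word, it assumes the rank byte is still zero; that holds here because records are constructed with rank = 0 and the byte is stamped exactly once, during forward-index conversion.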
DomainRankings.java (new)
@@ -0,0 +1,43 @@
+package nu.marginalia.wmsa.edge.index.postings;
+
+import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
+import it.unimi.dsi.fastutil.ints.Int2ShortOpenHashMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static java.lang.Math.max;
+import static java.lang.Math.min;
+
+public class DomainRankings {
+    private final Int2ShortOpenHashMap rankings;
+
+    private final int MAX_MEANINGFUL_RANK = 50_000;
+    private final int MAX_RANK_VALUE = 255;
+    private final int MIN_RANK_VALUE = 1;
+    private final double RANK_SCALING_FACTOR = (double) MAX_RANK_VALUE / MAX_MEANINGFUL_RANK;
+
+    public DomainRankings() {
+        rankings = new Int2ShortOpenHashMap();
+    }
+    public DomainRankings(Int2IntOpenHashMap values) {
+        rankings = new Int2ShortOpenHashMap(values.size());
+        values.forEach(this::putRanking);
+    }
+
+    private void putRanking(int domainId, int value) {
+        rankings.put(domainId, scaleRank(value));
+    }
+
+    private short scaleRank(int value) {
+        double rankScaled = RANK_SCALING_FACTOR * value;
+        return (short) min(MAX_RANK_VALUE, max(MIN_RANK_VALUE, rankScaled));
+    }
+
+    public int getRanking(int domainId) {
+        return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE);
+    }
+
+    public int size() {
+        return rankings.size();
+    }
+}
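DomainRankings compresses a domain's position in the PageRank ordering (0 = best) into the single rank byte: positions scale by 255/50 000, floor at MIN_RANK_VALUE = 1, clamp at MAX_RANK_VALUE = 255, and domains missing from the map default to 255, i.e. worst. A usage sketch (assumed calling code, not from the commit):

import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;

public class DomainRankingsDemo {
    public static void main(String[] args) {
        Int2IntOpenHashMap positions = new Int2IntOpenHashMap();
        positions.put(7, 0);       // best-ranked domain
        positions.put(8, 25_000);  // mid-field
        positions.put(9, 60_000);  // beyond MAX_MEANINGFUL_RANK

        DomainRankings rankings = new DomainRankings(positions);

        System.out.println(rankings.getRanking(7));   // 1, the floor
        System.out.println(rankings.getRanking(8));   // 127, scaled by 255/50_000
        System.out.println(rankings.getRanking(9));   // 255, clamped
        System.out.println(rankings.getRanking(99));  // 255, unknown domain = worst
    }
}

Storing the scaled value in an Int2ShortOpenHashMap keeps the in-memory table compact, since only one byte of information is retained per domain.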
SearchIndexControl.java
@@ -6,6 +6,7 @@ import nu.marginalia.wmsa.configuration.server.Initialization;
 import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
 import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
 import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
 import nu.marginalia.wmsa.edge.index.svc.EdgeOpsLockService;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -26,13 +27,14 @@ public class SearchIndexControl {
 
     @Inject
     public SearchIndexControl(IndexServicesFactory servicesFactory,
-                              EdgeOpsLockService opsLockService) {
+                              EdgeOpsLockService opsLockService,
+                              EdgeIndexSearchSetsService searchSetsService) {
         this.servicesFactory = servicesFactory;
 
         this.primaryIndexWriter = servicesFactory.getIndexWriter(0);
         this.secondaryIndexWriter = servicesFactory.getIndexWriter(1);
 
-        index = servicesFactory.createIndexBucket();
+        index = servicesFactory.createIndexBucket(searchSetsService);
         this.opsLockService = opsLockService;
     }
 
ForwardIndexConverter.java
@@ -3,6 +3,8 @@ package nu.marginalia.wmsa.edge.index.postings.forward;
 import com.upserve.uppend.blobs.NativeIO;
 import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
 import nu.marginalia.util.array.LongArray;
+import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
+import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
 import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader;
 import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile;
 import org.roaringbitmap.IntConsumer;
@@ -18,26 +20,26 @@ import java.nio.file.Path;
 import static nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexParameters.*;
 
 public class ForwardIndexConverter {
-    private static final int RWF_BIN_SIZE = 10_000_000;
-
-    private final Path tmpFileDir;
     private final File inputFile;
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     private final Path outputFileDocsId;
     private final Path outputFileDocsData;
+    private final DomainRankings domainRankings;
 
 
-    public ForwardIndexConverter(Path tmpFileDir,
+    public ForwardIndexConverter(
                                  File inputFile,
                                  Path outputFileDocsId,
-                                 Path outputFileDocsData
+                                 Path outputFileDocsData,
+                                 DomainRankings domainRankings
                                  ) {
-        this.tmpFileDir = tmpFileDir;
         this.inputFile = inputFile;
         this.outputFileDocsId = outputFileDocsId;
         this.outputFileDocsData = outputFileDocsData;
+        this.domainRankings = domainRankings;
     }
 
     public void convert() throws IOException {
@@ -50,6 +52,8 @@ public class ForwardIndexConverter {
 
         logger.info("Converting {} {}",inputFile, journalReader.fileHeader);
 
+        logger.info("Domain Rankings size = {}", domainRankings.size());
+
         try {
             LongArray docsFileId = getDocIds(outputFileDocsId, journalReader);
 
@@ -68,7 +72,10 @@ public class ForwardIndexConverter {
             journalReader.forEach(entry -> {
                 long entryOffset = (long) ENTRY_SIZE * docIdToIdx.get(entry.urlId());
 
-                docFileData.set(entryOffset + METADATA_OFFSET, entry.docMeta());
+                int ranking = domainRankings.getRanking(entry.domainId());
+                long meta = EdgePageDocumentsMetadata.encodeRank(entry.docMeta(), ranking);
+
+                docFileData.set(entryOffset + METADATA_OFFSET, meta);
                 docFileData.set(entryOffset + DOMAIN_OFFSET, entry.domainId());
             });
 
ParamMatchingQueryFilter.java
@@ -33,6 +33,11 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
         if (!validateSize(post)) {
             return false;
         }
+
+        if (!validateRank(post)) {
+            return false;
+        }
+
         return true;
     }
 
@@ -51,6 +56,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
 
         return limit.test(quality);
     }
 
     private boolean validateYear(ForwardIndexReader.DocPost post) {
         if (params.year().type() == SpecificationLimitType.NONE)
             return true;
@@ -69,6 +75,15 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
         return params.size().test(postVal);
     }
 
+    private boolean validateRank(ForwardIndexReader.DocPost post) {
+        if (params.rank().type() == SpecificationLimitType.NONE)
+            return true;
+
+        int postVal = EdgePageDocumentsMetadata.decodeRank(post.meta());
+
+        return params.rank().test(postVal);
+    }
+
     @Override
     public double cost() {
         return 32;
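With the rank byte in the metadata word, the filter can gate on domain rank exactly as it already gates on quality, year, and size: decode the byte, apply the query's limit. A minimal self-contained sketch of that gate (the IntPredicate stands in for params.rank().test(...); the names and the "at most 100" limit are illustrative, not from the commit):

import java.util.function.IntPredicate;

public class RankFilterDemo {
    public static void main(String[] args) {
        // a hypothetical "rank at most 100" query limit
        IntPredicate rankLimit = rank -> rank <= 100;

        long wellRanked = 42L << 48;   // metadata word whose rank byte is 42
        long unranked = 255L << 48;    // unknown/worst-ranked domain

        System.out.println(rankLimit.test((int) ((wellRanked >>> 48) & 0xFF))); // true
        System.out.println(rankLimit.test((int) ((unranked >>> 48) & 0xFF)));   // false
    }
}

Because unknown domains decode to 255, any upper-bound rank limit implicitly filters them out.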
ReverseIndexReader.java
@@ -53,6 +53,11 @@ public class ReverseIndexReader {
     }
 
     public EntrySource documents(int wordId, ReverseIndexEntrySourceBehavior behavior) {
+        if (null == words) {
+            logger.warn("Reverse index is not ready, dropping query");
+            return new EmptyEntrySource();
+        }
+
         if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource();
 
         long offset = words.get(wordId);
IndexQueryParams.java
@@ -7,6 +7,7 @@ import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit;
 public record IndexQueryParams(SpecificationLimit qualityLimit,
                                SpecificationLimit year,
                                SpecificationLimit size,
+                               SpecificationLimit rank,
                                SearchSet searchSet,
                                QueryStrategy queryStrategy
                                )
RankingAlgorithm.java (moved from nu.marginalia.util.ranking to nu.marginalia.wmsa.edge.index.ranking)
@@ -1,21 +1,19 @@
-package nu.marginalia.util.ranking;
+package nu.marginalia.wmsa.edge.index.ranking;
 
-import gnu.trove.list.TIntList;
 import gnu.trove.list.array.TIntArrayList;
 import gnu.trove.map.hash.TIntIntHashMap;
 import gnu.trove.map.hash.TIntObjectHashMap;
 import it.unimi.dsi.fastutil.ints.IntArrays;
-import it.unimi.dsi.fastutil.ints.IntComparator;
-import org.roaringbitmap.RoaringBitmap;
+import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.util.Arrays;
-import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Set;
-import java.util.function.IntToDoubleFunction;
-import java.util.stream.IntStream;
+import java.util.function.Supplier;
+
+import static java.lang.Math.min;
 
 public abstract class RankingAlgorithm {
     protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
@@ -133,29 +131,7 @@ public abstract class RankingAlgorithm {
         return domainsById.size();
     }
 
-    public RankVector pageRankVector() {
-        RankVector rank = new RankVector(1.d / domainsById.size());
-
-        int iter_max = 100;
-        for (int i = 0; i < iter_max; i++) {
-            RankVector newRank = createNewRankVector(rank);
-
-            double oldNorm = rank.norm();
-            double newNorm = newRank.norm();
-            double dNorm = oldNorm - newNorm ;
-            if (i < iter_max-1) {
-                adjustRankVector(newRank, dNorm, oldNorm);
-            }
-
-            rank = newRank;
-        }
-
-        return rank;
-    }
-
-    public RoaringBitmap pageRank(int resultCount) {
+    public <T> T pageRank(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
         RankVector rank = new RankVector(1.d / domainsById.size());
 
         int iter_max = 100;
@@ -174,10 +150,10 @@ public abstract class RankingAlgorithm {
         }
 
-        return rank.getRanking(resultCount);
+        return rank.getRanking(resultCount, accumulatorP).get();
     }
 
-    public RoaringBitmap pageRankWithPeripheralNodes(int resultCount) {
+    public <T> T pageRankWithPeripheralNodes(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
         RankVector rank = new RankVector(1.d / domainsById.size());
 
         int iter_max = 100;
@@ -201,32 +177,11 @@ public abstract class RankingAlgorithm {
 
         logger.info("PRWPN iteration done");
 
-        return rank.getRanking(resultCount);
+        return rank.getRanking(resultCount, accumulatorP).get();
     }
 
     abstract void adjustRankVector(RankVector vector, double dNorm, double oldNorm);
 
-    public TIntList pageRank(IntToDoubleFunction weight, int resultCount) {
-        RankVector rank = new RankVector(1.d / domainsById.size());
-
-        int iter_max = 100;
-        for (int i = 0; i < iter_max; i++) {
-            RankVector newRank = createNewRankVector(rank);
-
-            double oldNorm = rank.norm();
-            double newNorm = newRank.norm();
-            double dNorm = oldNorm - newNorm ;
-
-            if (i < iter_max-1) {
-                adjustRankVector(newRank, dNorm, oldNorm);
-            }
-
-            rank = newRank;
-        }
-
-        return rank.getRanking(weight, resultCount);
-    }
-
     abstract RankVector createNewRankVector(RankVector rank);
 
     public boolean includeInRanking(RankingDomainData data) {
@@ -271,9 +226,8 @@ public abstract class RankingAlgorithm {
 
         public double norm() {
             double v = 0.;
-            for (int i = 0; i < rank.length; i++) {
-                if (rank[i] > 0) { v+=rank[i]; }
-                else { v -= rank[i]; }
+            for (double value : rank) {
+                v += Math.abs(value);
             }
             return v;
         }
@@ -281,73 +235,38 @@ public abstract class RankingAlgorithm {
         public double norm(RankVector other) {
             double v = 0.;
             for (int i = 0; i < rank.length; i++) {
-                double dv = rank[i] - other.get(i);
-
-                if (dv > 0) { v+=dv; }
-                else { v -= dv; }
+                v += Math.abs(rank[i] - other.get(i));
             }
             return v;
         }
 
-        public TIntList getRanking(IntToDoubleFunction other, int numResults) {
-            TIntArrayList list = new TIntArrayList(numResults);
-
-            Comparator<Integer> comparator = Comparator.comparing(i -> Math.sqrt(other.applyAsDouble(domainIdToIndex.get(i)) * rank[i]));
-
-            IntStream.range(0, rank.length)
-                    .boxed()
-                    .sorted(comparator.reversed())
-                    .map(domainIndexToId::get)
-                    .limit(numResults)
-                    .forEach(list::add);
-
-            return list;
-        }
-
-        public RoaringBitmap getRanking(int numResults) {
+        public <T> RankingResultAccumulator<T> getRanking(int numResults, Supplier<RankingResultAccumulator<T>> accumulatorP) {
             if (numResults < 0) {
                 numResults = domainIdToIndex.size();
             }
-            if (numResults >= rank.length) {
-                numResults = rank.length;
-            }
-
-            RoaringBitmap list = new RoaringBitmap();
+            numResults = min(numResults, min(domainIdToIndex.size(), rank.length));
 
-            int[] nodes = new int[rank.length];
-            Arrays.setAll(nodes, i->i);
-            IntComparator comp = (i,j) -> (int) Math.signum(rank[j] - rank[i]);
-            IntArrays.quickSort(nodes, comp);
-
-            int i;
-
-            for (i = 0; i < numResults; i++) {
+            int[] nodes = sortOrder(rank);
+            var accumulator = accumulatorP.get();
+
+            for (int i = 0; i < numResults; i++) {
                 int id = domainIndexToId.get(nodes[i]);
 
                 if (includeInRanking(domainsById.get(id)))
-                    list.add(id);
+                    accumulator.add(id, i);
             }
 
-            for (; i < nodes.length && domainsById.size() < numResults; i++) {
-                int id = domainIndexToId.get(nodes[i]);
-
-                if (includeInRanking(domainsById.get(id)))
-                    list.add(id);
-            }
-
-            return list;
+            return accumulator;
         }
 
-        public void incrementAll(double v) {
-            for (int i = 0; i < rank.length; i++) {
-                rank[i]+=v;
-            }
-        }
-
-        int size() {
-            return domainsById.size();
+        private static int[] sortOrder(double[] values) {
+            int[] ret = new int[values.length];
+            Arrays.setAll(ret, i->i);
+            IntArrays.quickSort(ret, (i,j) -> (int) Math.signum(values[j] - values[i]));
+            return ret;
         }
     }
 }
RankingDomainData.java (moved from nu.marginalia.util.ranking to nu.marginalia.wmsa.edge.index.ranking)
@@ -1,4 +1,4 @@
-package nu.marginalia.util.ranking;
+package nu.marginalia.wmsa.edge.index.ranking;
 
 import lombok.AllArgsConstructor;
 import lombok.Data;
RankingDomainFetcher.java (moved from nu.marginalia.util.ranking to nu.marginalia.wmsa.edge.index.ranking)
@@ -1,4 +1,4 @@
-package nu.marginalia.util.ranking;
+package nu.marginalia.wmsa.edge.index.ranking;
 
 import com.google.inject.Inject;
 import com.zaxxer.hikari.HikariDataSource;
BetterReversePageRank.java (renamed to ReversePageRank.java, moved to nu.marginalia.wmsa.edge.index.ranking)
@@ -1,10 +1,10 @@
-package nu.marginalia.util.ranking;
+package nu.marginalia.wmsa.edge.index.ranking;
 
 
-public class BetterReversePageRank extends RankingAlgorithm {
+public class ReversePageRank extends RankingAlgorithm {
 
 
-    public BetterReversePageRank(RankingDomainFetcher domains, String... origins) {
+    public ReversePageRank(RankingDomainFetcher domains, String... origins) {
         super(domains, origins);
     }
 
@@ -20,8 +20,6 @@ public class BetterReversePageRank extends RankingAlgorithm {
             double newRankValue = 0;
 
             if (links != null && links.size() > 0) {
-
-
                 for (int j = 0; j < links.size(); j++) {
                     var revLinks = linkDataDest2Src[links.getQuick(j)];
                     newRankValue += rank.get(links.getQuick(j)) / revLinks.size();
BetterStandardPageRank.java (renamed to StandardPageRank.java, moved to nu.marginalia.wmsa.edge.index.ranking)
@@ -1,9 +1,9 @@
-package nu.marginalia.util.ranking;
+package nu.marginalia.wmsa.edge.index.ranking;
 
 
-public class BetterStandardPageRank extends RankingAlgorithm {
+public class StandardPageRank extends RankingAlgorithm {
 
-    public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) {
+    public StandardPageRank(RankingDomainFetcher domains, String... origins) {
         super(domains, origins);
     }
 
@@ -38,8 +38,7 @@ public class BetterStandardPageRank extends RankingAlgorithm {
 
     @Override
     void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
-        originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() /* dNorm/originDomainIds.size() */ ));
-        // vector.incrementAll(0.14*dNorm/vector.size());
+        originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() ));
     }
 
 }
RankingResultAccumulator.java (new)
@@ -0,0 +1,6 @@
+package nu.marginalia.wmsa.edge.index.ranking.accumulator;
+
+public interface RankingResultAccumulator<T> {
+    void add(int domainId, int rank);
+    T get();
+}
RankingResultBitSetAccumulator.java (new)
@@ -0,0 +1,17 @@
+package nu.marginalia.wmsa.edge.index.ranking.accumulator;
+
+import org.roaringbitmap.RoaringBitmap;
+
+public class RankingResultBitSetAccumulator implements RankingResultAccumulator<RoaringBitmap> {
+    private final RoaringBitmap result = new RoaringBitmap();
+
+    @Override
+    public void add(int domainId, int rank) {
+        result.add(domainId);
+    }
+
+    @Override
+    public RoaringBitmap get() {
+        return result;
+    }
+}
RankingResultHashMapAccumulator.java (new)
@@ -0,0 +1,21 @@
+package nu.marginalia.wmsa.edge.index.ranking.accumulator;
+
+import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
+
+public class RankingResultHashMapAccumulator implements RankingResultAccumulator<Int2IntOpenHashMap> {
+    private final Int2IntOpenHashMap result;
+
+    public RankingResultHashMapAccumulator(int size) {
+        result = new Int2IntOpenHashMap(size);
+    }
+
+    @Override
+    public void add(int domainId, int rank) {
+        result.put(domainId, rank);
+    }
+
+    @Override
+    public Int2IntOpenHashMap get() {
+        return result;
+    }
+}
RankingResultListAccumulator.java (new)
@@ -0,0 +1,24 @@
+package nu.marginalia.wmsa.edge.index.ranking.accumulator;
+
+import gnu.trove.list.array.TIntArrayList;
+
+public class RankingResultListAccumulator implements RankingResultAccumulator<TIntArrayList> {
+    private final TIntArrayList result;
+
+    public RankingResultListAccumulator(int size) {
+        result = new TIntArrayList(size);
+    }
+    public RankingResultListAccumulator() {
+        result = new TIntArrayList(10_000);
+    }
+
+    @Override
+    public void add(int domainId, int rank) {
+        result.add(domainId);
+    }
+
+    @Override
+    public TIntArrayList get() {
+        return result;
+    }
+}
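The three accumulators exist because the ranking algorithms no longer hard-code a RoaringBitmap result: callers pass a Supplier<RankingResultAccumulator<T>>, and the same PageRank walk can yield a bitmap (the ranking search sets), a domain-to-position map (feeding DomainRankings), or a plain list (the command-line tools). A self-contained sketch of the pattern (stand-in names, not from the commit):

import java.util.ArrayList;
import java.util.List;
import java.util.function.Supplier;

public class AccumulatorDemo {
    interface Accumulator<T> {            // stand-in for RankingResultAccumulator<T>
        void add(int domainId, int rank);
        T get();
    }

    // stand-in for RankingAlgorithm.pageRankWithPeripheralNodes(...)
    static <T> T rank(Supplier<Accumulator<T>> accumulatorP) {
        Accumulator<T> acc = accumulatorP.get();
        int[] domainsBestFirst = {17, 4, 99};       // pretend ranking outcome
        for (int i = 0; i < domainsBestFirst.length; i++) {
            acc.add(domainsBestFirst[i], i);        // domain id plus its position
        }
        return acc.get();
    }

    public static void main(String[] args) {
        List<Integer> asList = rank(() -> new Accumulator<List<Integer>>() {
            final List<Integer> result = new ArrayList<>();
            public void add(int domainId, int rank) { result.add(domainId); }
            public List<Integer> get() { return result; }
        });
        System.out.println(asList); // [17, 4, 99]
    }
}

Note that the rank argument to add() is the domain's position in the sorted order; RankingResultHashMapAccumulator is the only implementation that records it, which is precisely what DomainRankings consumes.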
@@ -1,4 +1,4 @@
-package nu.marginalia.util.ranking.old;
+package nu.marginalia.wmsa.edge.index.ranking.old;
 
 
 import com.zaxxer.hikari.HikariDataSource;
StandardPageRank.java (old implementation; moved from nu.marginalia.util.ranking.old to nu.marginalia.wmsa.edge.index.ranking.old)
@@ -1,4 +1,4 @@
-package nu.marginalia.util.ranking.old;
+package nu.marginalia.wmsa.edge.index.ranking.old;
 
 
 import com.zaxxer.hikari.HikariDataSource;
@@ -125,7 +125,6 @@ public class StandardPageRank {
 
         final TIntArrayList empty = new TIntArrayList();
 
-        double rankNorm = rank.norm();
         RankVector newRank = new RankVector(0);
 
         for (DomainData domain : domains.valueCollection()) {
@@ -176,8 +175,6 @@ public class StandardPageRank {
             }
         });
     }
 
-        TIntHashSet deadEnds = new TIntHashSet(domains.size());
-
     }
 
     private class RankVector {
PerusePageRankV2.java (moved from nu.marginalia.util.ranking.tool to nu.marginalia.wmsa.edge.index.ranking.tool)
@@ -1,4 +1,4 @@
-package nu.marginalia.util.ranking.tool;
+package nu.marginalia.wmsa.edge.index.ranking.tool;
 
 
 import com.zaxxer.hikari.HikariDataSource;
@@ -10,9 +10,9 @@ import it.unimi.dsi.fastutil.ints.IntArrays;
 import it.unimi.dsi.fastutil.ints.IntComparator;
 import lombok.AllArgsConstructor;
 import lombok.SneakyThrows;
-import nu.marginalia.util.ranking.RankingAlgorithm;
-import nu.marginalia.util.ranking.RankingDomainData;
-import nu.marginalia.util.ranking.RankingDomainFetcher;
+import nu.marginalia.wmsa.edge.index.ranking.RankingAlgorithm;
+import nu.marginalia.wmsa.edge.index.ranking.RankingDomainData;
+import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
 import org.jetbrains.annotations.NotNull;
@@ -33,8 +33,6 @@ public class PerusePageRankV2 {
     TIntArrayList[] linkDataSrc2Dest;
     TIntArrayList[] linkDataDest2Src;
 
-    private static final boolean getNames = true;
-
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     static final LinkedBlockingQueue<LinkAdjacencies> uploadQueue = new LinkedBlockingQueue<>(10);
UpdateDomainRanksTool2.java (moved from nu.marginalia.util.ranking.tool to nu.marginalia.wmsa.edge.index.ranking.tool)
@@ -1,9 +1,10 @@
-package nu.marginalia.util.ranking.tool;
+package nu.marginalia.wmsa.edge.index.ranking.tool;
 
 import com.zaxxer.hikari.HikariDataSource;
 import lombok.SneakyThrows;
-import nu.marginalia.util.ranking.BetterReversePageRank;
-import nu.marginalia.util.ranking.RankingDomainFetcher;
+import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
+import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
+import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
 import org.mariadb.jdbc.Driver;
@@ -17,8 +18,6 @@ public class UpdateDomainRanksTool2 {
 
     private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class);
 
-    public final long domainIdMax = -1;
-    public int domainCount;
     private volatile static int rankMax;
 
     static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
@@ -35,20 +34,21 @@ public class UpdateDomainRanksTool2 {
         logger.info("Ranking");
         var ds = new DatabaseModule().provideConnection();
         var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
-        var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
+        var rpr = new ReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
 
-        var rankVector = rpr.pageRankVector();
         rankMax = rpr.size();
         uploader.start();
 
-        var rankData = rpr.pageRankWithPeripheralNodes(rankMax);
-        for (int i : rankData) {
+        var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new);
+        rankData.forEach(i -> {
             try {
                 uploadQueue.put(i);
             } catch (InterruptedException e) {
                 e.printStackTrace();
             }
-        }
+            return true;
+        });
 
         long end = System.currentTimeMillis();
         running = false;
EdgeIndexQueryService.java
@@ -129,6 +129,7 @@ public class EdgeIndexQueryService {
                 specsSet.quality,
                 specsSet.year,
                 specsSet.size,
+                specsSet.rank,
                 getSearchSet(specsSet),
                 specsSet.queryStrategy);
     }
@ -2,51 +2,43 @@ package nu.marginalia.wmsa.edge.index.svc;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
|
||||||
import gnu.trove.list.TIntList;
|
|
||||||
import gnu.trove.list.array.TIntArrayList;
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.util.ranking.BetterReversePageRank;
|
import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
|
||||||
import nu.marginalia.util.ranking.BetterStandardPageRank;
|
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
|
||||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
|
||||||
|
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator;
|
+import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator;
 import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
 import nu.marginalia.wmsa.edge.index.model.RankingSettings;
+import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
 import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet;
 import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet;
 import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny;
 import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
-import org.roaringbitmap.RoaringBitmap;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 
 @Singleton
 public class EdgeIndexSearchSetsService {
-    private final HikariDataSource dataSource;
-    private RankingDomainFetcher rankingDomains;
+    private final RankingDomainFetcher rankingDomains;
     private final RankingSettings rankingSettings;
-    private final Logger logger = LoggerFactory.getLogger(getClass());
 
     private final SearchSet anySet = new SearchSetAny();
     private volatile RankingSearchSet retroSet;
     private volatile RankingSearchSet smallWebSet;
     private volatile RankingSearchSet academiaSet;
 
+    private volatile DomainRankings domainRankings = new DomainRankings();
 
     @Inject
-    public EdgeIndexSearchSetsService(HikariDataSource dataSource,
-                                      RankingDomainFetcher rankingDomains,
+    public EdgeIndexSearchSetsService(RankingDomainFetcher rankingDomains,
                                       RankingSettings rankingSettings,
                                       IndexServicesFactory servicesFactory) throws IOException {
-        this.dataSource = dataSource;
         this.rankingDomains = rankingDomains;
         this.rankingSettings = rankingSettings;
 
         smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
         academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat"));
         retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat"));
 
-        logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
     }
 
     public void recalculateAll() {
@@ -55,52 +47,27 @@ public class EdgeIndexSearchSetsService {
         updateSmallWebDomains();
     }
 
-    @SneakyThrows
-    public RoaringBitmap goodUrls() {
-        RoaringBitmap domains = new RoaringBitmap();
-        RoaringBitmap urls = new RoaringBitmap();
-
-        try (var connection = dataSource.getConnection()) {
-            try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) {
-                stmt.setFetchSize(10_000);
-                var rsp = stmt.executeQuery();
-                while (rsp.next()) {
-                    domains.add(rsp.getInt(1));
-                }
-            }
-
-            // For some reason, doing this "INNER JOIN" in Java is significantly faster than doing it in SQL
-            try (var stmt = connection.prepareStatement("SELECT ID,DOMAIN_ID FROM EC_URL WHERE VISITED AND EC_URL.STATE='OK'")) {
-                stmt.setFetchSize(10_000);
-                var rsp = stmt.executeQuery();
-                while (rsp.next()) {
-                    if (domains.contains(rsp.getInt(2))) {
-                        urls.add(rsp.getInt(1));
-                    }
-                }
-            }
-        }
-
-        return urls;
-    }
-
     @SneakyThrows
     public void updateRetroDomains() {
-        var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
-        var data = spr.pageRankWithPeripheralNodes(spr.size() / 2);
+        var spr = new StandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
+        var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new);
 
         synchronized (this) {
             retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
             retroSet.write();
         }
 
+        var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2, () -> new RankingResultHashMapAccumulator(100_000));
+        synchronized (this) {
+            domainRankings = new DomainRankings(ranks);
+        }
     }
 
     @SneakyThrows
     public void updateSmallWebDomains() {
-        var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
+        var rpr = new ReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
         rpr.setMaxKnownUrls(750);
-        var data = rpr.pageRankWithPeripheralNodes(rpr.size());
+        var data = rpr.pageRankWithPeripheralNodes(rpr.size(), RankingResultBitSetAccumulator::new);
 
         synchronized (this) {
             smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
@@ -110,8 +77,8 @@ public class EdgeIndexSearchSetsService {
 
     @SneakyThrows
     public void updateAcademiaDomains() {
-        var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
-        var data = spr.pageRankWithPeripheralNodes(spr.size()/2);
+        var spr = new StandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
+        var data = spr.pageRankWithPeripheralNodes(spr.size()/2, RankingResultBitSetAccumulator::new);
 
         synchronized (this) {
             academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
@@ -119,41 +86,8 @@ public class EdgeIndexSearchSetsService {
         }
     }
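The update methods above now run each PageRank pass through a pluggable accumulator: a bit set when only search-set membership matters, and a hash map of per-domain rank values to back the new DomainRankings lookup. Below is a minimal sketch of that supplier-driven pattern; the interface and class names are stand-ins, not the project's actual API.

    import java.util.BitSet;
    import java.util.HashMap;
    import java.util.Map;
    import java.util.function.Supplier;

    public class AccumulatorSketch {
        // Hypothetical accumulator contract mirroring the RankingResult*Accumulator
        // pattern visible in this diff; not the actual Marginalia interface.
        interface RankAccumulator<T> {
            void add(int domainId, int rankValue); // record one ranked domain
            T get();                               // finish and return the result
        }

        // Membership-only view, analogous to RankingResultBitSetAccumulator.
        static class BitSetAccumulator implements RankAccumulator<BitSet> {
            private final BitSet set = new BitSet();
            public void add(int domainId, int rankValue) { set.set(domainId); }
            public BitSet get() { return set; }
        }

        // Per-domain rank values, analogous to RankingResultHashMapAccumulator.
        static class HashMapAccumulator implements RankAccumulator<Map<Integer, Integer>> {
            private final Map<Integer, Integer> ranks;
            HashMapAccumulator(int sizeHint) { ranks = new HashMap<>(sizeHint); }
            public void add(int domainId, int rankValue) { ranks.put(domainId, rankValue); }
            public Map<Integer, Integer> get() { return ranks; }
        }

        // The ranking pass is generic over the accumulator, so one algorithm can
        // feed both the search-set membership test and the DomainRankings table.
        static <T> T rank(int[] rankedDomains, Supplier<RankAccumulator<T>> factory) {
            var acc = factory.get();
            for (int i = 0; i < rankedDomains.length; i++) {
                acc.add(rankedDomains[i], i); // position in the ranking serves as the rank value
            }
            return acc.get();
        }

        public static void main(String[] args) {
            int[] ranking = {14, 3, 27};
            System.out.println(rank(ranking, BitSetAccumulator::new));
            System.out.println(rank(ranking, () -> new HashMapAccumulator(100)));
        }
    }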
 
-    @SneakyThrows
-    public TIntList getStandardDomains() {
-        TIntArrayList results = new TIntArrayList();
-
-        try (var connection = dataSource.getConnection();
-             var stmt = connection.prepareStatement(
-                     """
-                     SELECT ID FROM EC_DOMAIN
-                     WHERE INDEXED>0
-                     AND STATE='ACTIVE'
-                     AND DOMAIN_ALIAS IS NULL
-                     ORDER BY ID ASC
-                     """);
-        ) {
-            var rs = stmt.executeQuery();
-            while (rs.next()) {
-                results.add(rs.getInt(1));
-            }
-        }
-        return results;
-    }
-
-    @SneakyThrows
-    public TIntList getSpecialDomains() {
-        TIntArrayList results = new TIntArrayList();
-        try (var connection = dataSource.getConnection();
-             var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'")
-        ) {
-            var rs = stmt.executeQuery();
-            while (rs.next()) {
-                results.add(rs.getInt(1));
-            }
-        }
-        return results;
-    }
+    public DomainRankings getDomainRankings() {
+        return domainRankings;
+    }
 
     public SearchSet getSearchSetByName(SearchSetIdentifier searchSetIdentifier) {
@@ -24,6 +24,11 @@ public record EdgeSearchResultKeywordScore(int set,
             sum += 20;
         }
 
+        int rank = EdgePageDocumentsMetadata.decodeRank(encodedDocMetadata) - 13;
+        if (rank < 0)
+            sum += rank / 2;
+        else
+            sum += rank / 4;
 
         return sum;
     }
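The added scoring lines centre the decoded document rank on 13 and fold it into the penalty sum asymmetrically: well-ranked documents (rank below 13) get a bonus at half weight, while poorly ranked ones pay a penalty at quarter weight. A self-contained sketch of just that arithmetic; the 0-255 input range is inferred from the documentation change later in this diff, not from this file.

    public class RankPenaltySketch {
        // Mirrors the adjustment in EdgeSearchResultKeywordScore: ranks below 13
        // subtract from the penalty sum at half weight, ranks above add a quarter.
        // Assumes the decoded rank lies in 0..255 (an inference from the 1 - 255
        // span in the documentation hunk below).
        static int rankAdjustment(int decodedRank) {
            int rank = decodedRank - 13;
            return rank < 0 ? rank / 2 : rank / 4;
        }

        public static void main(String[] args) {
            System.out.println(rankAdjustment(1));   // -6: strong bonus for top-ranked domains
            System.out.println(rankAdjustment(13));  //  0: neutral point
            System.out.println(rankAdjustment(255)); // 60: mild penalty at the bottom of the span
        }
    }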
@@ -19,6 +19,7 @@ public class EdgeSearchSpecification {
     public final SpecificationLimit quality;
     public final SpecificationLimit year;
     public final SpecificationLimit size;
+    public final SpecificationLimit rank;
 
     public final QueryLimits queryLimits;
     public final QueryStrategy queryStrategy;
@@ -97,6 +97,7 @@ public class QueryFactory {
         SpecificationLimit qualityLimit = profile.getQualityLimit();
         SpecificationLimit year = profile.getYearLimit();
         SpecificationLimit size = profile.getSizeLimit();
+        SpecificationLimit rank = SpecificationLimit.none();
 
         for (Token t : basicQuery) {
             if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) {
@@ -116,6 +117,9 @@ public class QueryFactory {
             if (t.type == TokenType.SIZE_TERM) {
                 size = parseSpecificationLimit(t.str);
             }
+            if (t.type == TokenType.RANK_TERM) {
+                rank = parseSpecificationLimit(t.str);
+            }
             if (t.type == TokenType.QS_TERM) {
                 queryStrategy = parseQueryStrategy(t.str);
             }
@@ -154,6 +158,8 @@ public class QueryFactory {
                 case QUALITY_TERM:
                 case YEAR_TERM:
                 case SIZE_TERM:
+                case RANK_TERM:
+                case QS_TERM:
                     break; //
                 case NEAR_TERM:
                     near = t.str;
@@ -199,6 +205,7 @@ public class QueryFactory {
                 .quality(qualityLimit)
                 .year(year)
                 .size(size)
+                .rank(rank)
                 .domains(domains)
                 .queryStrategy(queryStrategy)
                 .searchSetIdentifier(profile.searchSetIdentifier);
@@ -93,6 +93,8 @@ public class QueryParser {
             entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr));
         } else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) {
             entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr));
+        } else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) {
+            entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr));
         } else if (t.str.startsWith("qs=")) {
             entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr));
         } else if (t.str.contains(":")) {
@@ -508,6 +510,7 @@ enum TokenType implements Predicate<Token> {
     QUALITY_TERM,
     YEAR_TERM,
     SIZE_TERM,
+    RANK_TERM,
     NEAR_TERM,
 
     QS_TERM,
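The parser changes wire `rank` up exactly like `size`: a token matching rank[=><]\d+ is truncated after the four-character prefix, so the retained string begins with the comparator. A sketch of that tokenizing step follows; the Limit record and parseLimit method are stand-ins for SpecificationLimit and parseSpecificationLimit, whose internals this diff does not show.

    import java.util.regex.Pattern;

    public class RankTermSketch {
        private static final Pattern RANK_TERM = Pattern.compile("rank[=><]\\d+");

        // Stand-in for SpecificationLimit: a comparator plus a bound.
        record Limit(char comparator, int value) {}

        // The diff keeps t.str.substring(4), i.e. "=50", ">50" or "<50";
        // the first retained character selects equals/greater/less semantics.
        static Limit parseLimit(String term) {
            if (!RANK_TERM.matcher(term).matches())
                throw new IllegalArgumentException("not a rank term: " + term);
            String rest = term.substring(4);
            return new Limit(rest.charAt(0), Integer.parseInt(rest.substring(1)));
        }

        public static void main(String[] args) {
            System.out.println(parseLimit("rank>50")); // Limit[comparator=>, value=50]
            System.out.println(parseLimit("rank<30")); // Limit[comparator=<, value=30]
        }
    }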
@@ -56,6 +56,9 @@
 <tr><td>year=2005</td><td>(beta) The document was ostensibly published in 2005</td></tr>
 <tr><td>year<2005</td><td>(beta) The document was ostensibly published in or before 2005</td></tr>
 
+<tr><td>rank>50</td><td>(beta) The ranking of the website is at least 50 in a span of 1 - 255</td></tr>
+<tr><td>rank<50</td><td>(beta) The ranking of the website is at most 50 in a span of 1 - 255</td></tr>
+
 <tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
 <tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>
 <tr><td>format:html123</td><td>Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites. </td></tr>
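As an illustrative usage, not taken from the changeset itself: once this filter is live, a query such as `wiki rank<50` should restrict matches to the fifty best-ranked positions of the 1 - 255 span, following the same syntax as the existing year and size filters.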
@@ -3,13 +3,15 @@ package nu.marginalia.wmsa.edge.index.model;
 
 import org.junit.jupiter.api.Test;
 
+import java.util.EnumSet;
+
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
 class EdgePageDocumentsMetadataTest {
 
     @Test
     public void codecYear() {
-        var meta = new EdgePageDocumentsMetadata(0, 0, 192, 0, 0, (byte) 0);
+        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 192, 0, 0, (byte) 0);
         long encoded = meta.encode();
         var decoded = new EdgePageDocumentsMetadata(encoded);
         assertEquals(192, decoded.year());
@@ -17,7 +19,7 @@ class EdgePageDocumentsMetadataTest {
 
     @Test
     public void codecTopology() {
-        var meta = new EdgePageDocumentsMetadata(0, 192, 0, 0, 0, (byte) 0);
+        var meta = new EdgePageDocumentsMetadata(0, 0, 192, 0, 0, 0, (byte) 0);
         long encoded = meta.encode();
         var decoded = new EdgePageDocumentsMetadata(encoded);
         assertEquals(192, decoded.topology());
@@ -25,7 +27,7 @@ class EdgePageDocumentsMetadataTest {
 
     @Test
     public void codecSets() {
-        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 14, 0, (byte) 0);
+        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 14, 0, (byte) 0);
         long encoded = meta.encode();
         var decoded = new EdgePageDocumentsMetadata(encoded);
         assertEquals(14, decoded.sets());
@@ -33,7 +35,7 @@ class EdgePageDocumentsMetadataTest {
 
     @Test
     public void codecQuality() {
-        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 9, (byte) 0);
+        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, 9, (byte) 0);
         long encoded = meta.encode();
         var decoded = new EdgePageDocumentsMetadata(encoded);
         assertEquals(9, decoded.quality());
@@ -41,7 +43,7 @@ class EdgePageDocumentsMetadataTest {
 
     @Test
     public void codecFlags() {
-        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, (byte) 255);
+        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, 0, (byte) 255);
         long encoded = meta.encode();
         System.out.println(Long.toHexString(encoded));
         var decoded = new EdgePageDocumentsMetadata(encoded);
@@ -57,7 +59,17 @@ class EdgePageDocumentsMetadataTest {
         assertEquals(50, new EdgePageDocumentsMetadata(0).withSize(4).size());
         assertEquals(50, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(4).encode()));
 
-        assertEquals(50*255, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).encode()));
-        assertEquals(50*255, new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).size());
+        assertEquals(50 * 255, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).encode()));
+        assertEquals(50 * 255, new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).size());
+    }
+
+    @Test
+    public void encRank() {
+        var meta = new EdgePageDocumentsMetadata(5, 22, 3, 8, EnumSet.noneOf(EdgePageDocumentFlags.class))
+                .withSize(0xffffffff).encode();
+        var enc2 = EdgePageDocumentsMetadata.encodeRank(meta, 83);
+
+        assertEquals(83, EdgePageDocumentsMetadata.decodeRank(enc2));
+        assertEquals(5, EdgePageDocumentsMetadata.decodeTopology(enc2));
     }
 }
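Read together with the forward-index test's expected constant 0x00FF000000000000L below, the new encRank test suggests the rank is an eight-bit field in bits 48-55 of the packed metadata long, defaulting to 255 for unranked domains. Here is a sketch of such an encode/decode pair under that assumption; the actual EdgePageDocumentsMetadata layout may differ.

    public class RankFieldSketch {
        // Assumed layout: rank occupies bits 48..55 of the 64-bit metadata word.
        private static final int RANK_SHIFT = 48;
        private static final long RANK_MASK = 0xFFL;

        static long encodeRank(long meta, int rank) {
            meta &= ~(RANK_MASK << RANK_SHIFT);               // clear the old rank field
            return meta | ((rank & RANK_MASK) << RANK_SHIFT); // splice in the new one
        }

        static int decodeRank(long meta) {
            return (int) ((meta >>> RANK_SHIFT) & RANK_MASK);
        }

        public static void main(String[] args) {
            System.out.println(decodeRank(encodeRank(0L, 83)));       // 83
            System.out.println(Long.toHexString(encodeRank(0, 255))); // ff000000000000
        }
    }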
@@ -6,6 +6,7 @@ import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
 import nu.marginalia.util.test.TestUtil;
 import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
 import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
+import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
 import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry;
 import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader;
 import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile;
@@ -36,7 +37,6 @@ class ForwardIndexConverterTest {
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     Path dataDir;
-    private Path wordsFile;
     private Path docsFileId;
     private Path docsFileData;
 
@@ -71,7 +71,6 @@ class ForwardIndexConverterTest {
 
         var reader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile));
 
-        wordsFile = dataDir.resolve("words.dat");
         docsFileId = dataDir.resolve("docs-i.dat");
         docsFileData = dataDir.resolve("docs-d.dat");
     }
@@ -104,18 +103,15 @@ class ForwardIndexConverterTest {
     @Test
     void testForwardIndex() throws IOException {
 
-        Path tmpDir = Path.of("/tmp");
-
-        new ForwardIndexConverter(tmpDir, indexFile.toFile(), docsFileId, docsFileData).convert();
+        new ForwardIndexConverter(indexFile.toFile(), docsFileId, docsFileData, new DomainRankings()).convert();
 
         var forwardReader = new ForwardIndexReader(docsFileId, docsFileData);
 
         for (int i = 36; i < workSetSize; i++) {
-            assertEquals(i % 5, forwardReader.getDocMeta(i));
+            assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(i));
             assertEquals(i/20, forwardReader.getDomainId(i));
         }
 
-        TestUtil.clearTempDir(dataDir);
     }
@@ -82,6 +82,7 @@ public class EdgeIndexIntegrationTest {
                     .year(SpecificationLimit.none())
                     .quality(SpecificationLimit.none())
                     .size(SpecificationLimit.none())
+                    .rank(SpecificationLimit.none())
                     .domains(new ArrayList<>())
                     .searchSetIdentifier(SearchSetIdentifier.NONE)
                     .subqueries(List.of(new EdgeSearchSubquery(
@@ -113,6 +114,7 @@ public class EdgeIndexIntegrationTest {
                     .year(SpecificationLimit.none())
                     .quality(SpecificationLimit.none())
                     .size(SpecificationLimit.none())
+                    .rank(SpecificationLimit.none())
                     .queryStrategy(QueryStrategy.SENTENCE)
                     .domains(List.of(2))
                     .subqueries(List.of(new EdgeSearchSubquery(
@@ -139,6 +141,7 @@ public class EdgeIndexIntegrationTest {
                     .quality(SpecificationLimit.none())
                     .year(SpecificationLimit.equals(1998))
                     .size(SpecificationLimit.none())
+                    .rank(SpecificationLimit.none())
                     .queryStrategy(QueryStrategy.SENTENCE)
                     .searchSetIdentifier(SearchSetIdentifier.NONE)
                     .subqueries(List.of(new EdgeSearchSubquery(
@@ -161,7 +164,7 @@ public class EdgeIndexIntegrationTest {
 
         long fullId = id | ((long) (32 - (id % 32)) << 32);
 
-        var header = new SearchIndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, id % 5, id, id % 20, (byte) 0).encode());
+        var header = new SearchIndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
 
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
@@ -4,6 +4,7 @@ import com.google.inject.AbstractModule;
 import com.google.inject.name.Names;
 import nu.marginalia.util.test.TestUtil;
 import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
+import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
 import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
 import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny;
 import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
@@ -42,11 +43,12 @@ public class EdgeIndexIntegrationTestModule extends AbstractModule {
         System.setProperty("small-ram", "true");
         try {
             bind(IndexServicesFactory.class).toInstance(new IndexServicesFactory(Path.of("/tmp"),
-                    slowDir, fastDir, null
+                    slowDir, fastDir
             ));
 
             EdgeIndexSearchSetsService setsServiceMock = Mockito.mock(EdgeIndexSearchSetsService.class);
             when(setsServiceMock.getSearchSetByName(SearchSetIdentifier.NONE)).thenReturn(new SearchSetAny());
+            when(setsServiceMock.getDomainRankings()).thenReturn(new DomainRankings());
 
             bind(EdgeIndexSearchSetsService.class).toInstance(setsServiceMock);