Clean up documentation and rename domain-links to link-graph

Commit 9f1649636e (parent 3a65fe8917).
@ -1,15 +1,3 @@
## Domain Link Database

The domain link database contains information about links
between domains. It is a static in-memory database loaded
from a binary file.

* [DomainLinkDb](java/nu/marginalia/linkdb/DomainLinkDb.java)
* * [FileDomainLinkDb](java/nu/marginalia/linkdb/FileDomainLinkDb.java)
* * [SqlDomainLinkDb](java/nu/marginalia/linkdb/SqlDomainLinkDb.java)
* [DomainLinkDbWriter](java/nu/marginalia/linkdb/DomainLinkDbWriter.java)
* [DomainLinkDbLoader](java/nu/marginalia/linkdb/DomainLinkDbLoader.java)

## Document Database

The document database contains information about links,
@ -21,10 +9,10 @@ is not in the MariaDB database is that this would make updates to
this information take effect in production immediately, even before
the information was searchable.

* [DocumentLinkDbWriter](java/nu/marginalia/linkdb/DocumentDbWriter.java)
* [DocumentLinkDbLoader](java/nu/marginalia/linkdb/DocumentDbReader.java)
* [DocumentLinkDbWriter](java/nu/marginalia/linkdb/docs/DocumentDbWriter.java)
* [DocumentLinkDbLoader](java/nu/marginalia/linkdb/docs/DocumentDbReader.java)

## See Also

These databases are constructed by the [loading-process](../../processes/loading-process), and consumed by the [index-service](../../services-core/index-service).
The database is constructed by the [loading-process](../../processes/loading-process), and consumed by the [index-service](../../services-core/index-service).
@ -7,6 +7,5 @@ as shared models.
* [config](config/) contains some `@Inject`ables.
* [renderer](renderer/) contains utility code for rendering website templates.
* [service](service/) contains the shared base classes for main methods and web services.
* [service-client](service-client/) is the shared base class for RPC.
* [service-discovery](service-discovery) contains tools that let the services find each other.
* [service-discovery](service-discovery) contains tools that let the services find each other and communicate.
* [process](process/) contains boilerplate for batch processes.
@ -34,7 +34,7 @@ dependencies {
|
||||
|
||||
implementation project(':code:libraries:message-queue')
|
||||
|
||||
implementation project(':code:functions:domain-links:api')
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
implementation project(':code:execution:api')
|
||||
|
||||
implementation project(':code:process-models:crawl-spec')
|
||||
|
@ -6,7 +6,7 @@ import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
@ -32,7 +32,7 @@ public class ExportDataActor extends RecordActorPrototype {
|
||||
private final FileStorageService storageService;
|
||||
private final HikariDataSource dataSource;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final AggregateDomainLinksClient domainLinksClient;
|
||||
private final AggregateLinkGraphClient linkGraphClient;
|
||||
|
||||
public record Export() implements ActorStep {}
|
||||
public record ExportBlacklist(FileStorageId fid) implements ActorStep {}
|
||||
@ -114,7 +114,7 @@ public class ExportDataActor extends RecordActorPrototype {
|
||||
var tmpFile = Files.createTempFile(storage.asPath(), "export", ".csv.gz",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
|
||||
var allLinks = domainLinksClient.getAllDomainLinks();
|
||||
var allLinks = linkGraphClient.getAllDomainLinks();
|
||||
|
||||
try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))))
|
||||
{
|
||||
@ -155,12 +155,12 @@ public class ExportDataActor extends RecordActorPrototype {
|
||||
public ExportDataActor(Gson gson,
|
||||
FileStorageService storageService,
|
||||
HikariDataSource dataSource,
|
||||
AggregateDomainLinksClient domainLinksClient)
|
||||
AggregateLinkGraphClient linkGraphClient)
|
||||
{
|
||||
super(gson);
|
||||
this.storageService = storageService;
|
||||
this.dataSource = dataSource;
|
||||
this.domainLinksClient = domainLinksClient;
|
||||
this.linkGraphClient = linkGraphClient;
|
||||
}
|
||||
|
||||
}
|
||||
|
code/execution/readme.md (new file, 12 lines)
@ -0,0 +1,12 @@
The execution subsystem is responsible for the execution of long-running tasks on each
index node. It lives in the [executor-service](../services-core/executor-service) module.

It accomplishes this using the [message queue and actor library](../libraries/message-queue/),
which permits program state to survive crashes and reboots.

The subsystem exposes four [APIs](api/src/main/protobuf/executor-api.proto):

* Execution API - for starting and stopping tasks, also contains miscellaneous commands
* Crawl API - for managing the crawl workflow
* Sideload API - for sideloading data
* Export API - for exporting data
@ -16,4 +16,3 @@ holistically, not by question or answer, it is necessary to re-arrange
the data (which is very large). SQLite does a decent job of enabling
this task.

See [tools/stackexchange-converter](../../tools/stackexchange-converter).
@ -15,7 +15,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:functions:domain-info:api')
|
||||
implementation project(':code:functions:domain-links:api')
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
|
@ -2,7 +2,7 @@ package nu.marginalia.functions.domains;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.api.domains.RpcDomainInfoResponse;
|
||||
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
@ -21,7 +21,7 @@ public class DomainInformationService {
|
||||
private final GeoIpDictionary geoIpDictionary;
|
||||
|
||||
private DbDomainQueries dbDomainQueries;
|
||||
private final AggregateDomainLinksClient domainLinksClient;
|
||||
private final AggregateLinkGraphClient linkGraphClient;
|
||||
private HikariDataSource dataSource;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@ -29,11 +29,11 @@ public class DomainInformationService {
|
||||
public DomainInformationService(
|
||||
DbDomainQueries dbDomainQueries,
|
||||
GeoIpDictionary geoIpDictionary,
|
||||
AggregateDomainLinksClient domainLinksClient,
|
||||
AggregateLinkGraphClient linkGraphClient,
|
||||
HikariDataSource dataSource) {
|
||||
this.dbDomainQueries = dbDomainQueries;
|
||||
this.geoIpDictionary = geoIpDictionary;
|
||||
this.domainLinksClient = domainLinksClient;
|
||||
this.linkGraphClient = linkGraphClient;
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
@ -84,8 +84,8 @@ public class DomainInformationService {
|
||||
inCrawlQueue = rs.next();
|
||||
builder.setInCrawlQueue(inCrawlQueue);
|
||||
|
||||
builder.setIncomingLinks(domainLinksClient.countLinksToDomain(domainId));
|
||||
builder.setOutboundLinks(domainLinksClient.countLinksFromDomain(domainId));
|
||||
builder.setIncomingLinks(linkGraphClient.countLinksToDomain(domainId));
|
||||
builder.setOutboundLinks(linkGraphClient.countLinksFromDomain(domainId));
|
||||
|
||||
rs = stmt.executeQuery(STR."""
|
||||
SELECT KNOWN_URLS, GOOD_URLS, VISITED_URLS FROM DOMAIN_METADATA WHERE ID=\{domainId}
|
||||
|
@ -11,7 +11,7 @@ import gnu.trove.set.hash.TIntHashSet;
|
||||
import it.unimi.dsi.fastutil.ints.Int2DoubleArrayMap;
|
||||
import nu.marginalia.api.domains.*;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
import org.slf4j.Logger;
|
||||
@ -20,7 +20,6 @@ import org.slf4j.LoggerFactory;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.stream.IntStream;
|
||||
@ -29,7 +28,7 @@ public class SimilarDomainsService {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(SimilarDomainsService.class);
|
||||
private final HikariDataSource dataSource;
|
||||
private final AggregateDomainLinksClient domainLinksClient;
|
||||
private final AggregateLinkGraphClient linkGraphClient;
|
||||
|
||||
private volatile TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000);
|
||||
private volatile int[] domainIdxToId;
|
||||
@ -45,9 +44,9 @@ public class SimilarDomainsService {
|
||||
volatile boolean isReady = false;
|
||||
|
||||
@Inject
|
||||
public SimilarDomainsService(HikariDataSource dataSource, AggregateDomainLinksClient domainLinksClient) {
|
||||
public SimilarDomainsService(HikariDataSource dataSource, AggregateLinkGraphClient linkGraphClient) {
|
||||
this.dataSource = dataSource;
|
||||
this.domainLinksClient = domainLinksClient;
|
||||
this.linkGraphClient = linkGraphClient;
|
||||
|
||||
Executors.newSingleThreadExecutor().submit(this::init);
|
||||
}
|
||||
@ -262,7 +261,7 @@ public class SimilarDomainsService {
|
||||
private TIntSet getLinkingIdsDToS(int domainIdx) {
|
||||
var items = new TIntHashSet();
|
||||
|
||||
for (int id : domainLinksClient.getLinksFromDomain(domainIdxToId[domainIdx])) {
|
||||
for (int id : linkGraphClient.getLinksFromDomain(domainIdxToId[domainIdx])) {
|
||||
items.add(domainIdToIdx.get(id));
|
||||
}
|
||||
|
||||
@ -272,7 +271,7 @@ public class SimilarDomainsService {
|
||||
private TIntSet getLinkingIdsSToD(int domainIdx) {
|
||||
var items = new TIntHashSet();
|
||||
|
||||
for (int id : domainLinksClient.getLinksToDomain(domainIdxToId[domainIdx])) {
|
||||
for (int id : linkGraphClient.getLinksToDomain(domainIdxToId[domainIdx])) {
|
||||
items.add(domainIdToIdx.get(id));
|
||||
}
|
||||
|
||||
|
@ -1,30 +0,0 @@
|
||||
package nu.marginalia.api.indexdomainlinks;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.api.domainlink.DomainLinksApiGrpc;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@Singleton
|
||||
public class PartitionDomainLinksClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(PartitionDomainLinksClient.class);
|
||||
|
||||
private final GrpcMultiNodeChannelPool<DomainLinksApiGrpc.DomainLinksApiBlockingStub> channelPool;
|
||||
|
||||
@Inject
|
||||
public PartitionDomainLinksClient(GrpcChannelPoolFactory factory) {
|
||||
this.channelPool = factory.createMulti(
|
||||
ServiceKey.forGrpcApi(DomainLinksApiGrpc.class, ServicePartition.multi()),
|
||||
DomainLinksApiGrpc::newBlockingStub);
|
||||
}
|
||||
|
||||
public GrpcMultiNodeChannelPool<DomainLinksApiGrpc.DomainLinksApiBlockingStub> getChannelPool() {
|
||||
return channelPool;
|
||||
}
|
||||
|
||||
}
|
@ -14,7 +14,7 @@ java {
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:functions:domain-links:api')
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
@ -1,20 +1,25 @@
|
||||
package nu.marginalia.functions.domainlinks;
|
||||
package nu.marginalia.linkgraph;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.api.domainlink.*;
|
||||
import nu.marginalia.api.indexdomainlinks.PartitionDomainLinksClient;
|
||||
import nu.marginalia.api.linkgraph.*;
|
||||
import nu.marginalia.api.linkgraph.PartitionLinkGraphClient;
|
||||
import nu.marginalia.api.linkgraph.LinkGraphApiGrpc;
|
||||
import nu.marginalia.api.linkgraph.LinkGraphApiGrpc.LinkGraphApiBlockingStub;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class AggregateDomainLinksService extends DomainLinksApiGrpc.DomainLinksApiImplBase {
|
||||
private static final Logger logger = LoggerFactory.getLogger(AggregateDomainLinksService.class);
|
||||
private final PartitionDomainLinksClient client;
|
||||
/** This class is responsible for aggregating the link graph data from the partitioned link graph
|
||||
* services.
|
||||
*/
|
||||
public class AggregateLinkGraphService extends LinkGraphApiGrpc.LinkGraphApiImplBase {
|
||||
private static final Logger logger = LoggerFactory.getLogger(AggregateLinkGraphService.class);
|
||||
private final PartitionLinkGraphClient client;
|
||||
|
||||
@Inject
|
||||
public AggregateDomainLinksService(PartitionDomainLinksClient client) {
|
||||
public AggregateLinkGraphService(PartitionLinkGraphClient client) {
|
||||
this.client = client;
|
||||
}
|
||||
|
||||
@ -22,7 +27,7 @@ public class AggregateDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
|
||||
public void getAllLinks(Empty request,
|
||||
StreamObserver<RpcDomainIdPairs> responseObserver) {
|
||||
|
||||
client.getChannelPool().call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::getAllLinks)
|
||||
client.getChannelPool().call(LinkGraphApiBlockingStub::getAllLinks)
|
||||
.run(Empty.getDefaultInstance())
|
||||
.forEach(iter -> iter.forEachRemaining(responseObserver::onNext));
|
||||
|
||||
@ -34,7 +39,7 @@ public class AggregateDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
|
||||
StreamObserver<RpcDomainIdList> responseObserver) {
|
||||
var rspBuilder = RpcDomainIdList.newBuilder();
|
||||
|
||||
client.getChannelPool().call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::getLinksFromDomain)
|
||||
client.getChannelPool().call(LinkGraphApiBlockingStub::getLinksFromDomain)
|
||||
.run(request)
|
||||
.stream()
|
||||
.map(RpcDomainIdList::getDomainIdList)
|
||||
@ -51,7 +56,7 @@ public class AggregateDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
|
||||
var rspBuilder = RpcDomainIdList.newBuilder();
|
||||
|
||||
|
||||
client.getChannelPool().call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::getLinksToDomain)
|
||||
client.getChannelPool().call(LinkGraphApiBlockingStub::getLinksToDomain)
|
||||
.run(request)
|
||||
.stream()
|
||||
.map(RpcDomainIdList::getDomainIdList)
|
||||
@ -65,7 +70,7 @@ public class AggregateDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
|
||||
@Override
|
||||
public void countLinksFromDomain(RpcDomainId request,
|
||||
StreamObserver<RpcDomainIdCount> responseObserver) {
|
||||
int sum = client.getChannelPool().call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::countLinksFromDomain)
|
||||
int sum = client.getChannelPool().call(LinkGraphApiBlockingStub::countLinksFromDomain)
|
||||
.run(request)
|
||||
.stream()
|
||||
.mapToInt(RpcDomainIdCount::getIdCount)
|
||||
@ -81,7 +86,7 @@ public class AggregateDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
|
||||
public void countLinksToDomain(RpcDomainId request,
|
||||
StreamObserver<RpcDomainIdCount> responseObserver) {
|
||||
|
||||
int sum = client.getChannelPool().call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::countLinksToDomain)
|
||||
int sum = client.getChannelPool().call(LinkGraphApiBlockingStub::countLinksToDomain)
|
||||
.run(request)
|
||||
.stream()
|
||||
.mapToInt(RpcDomainIdCount::getIdCount)
|
code/functions/link-graph/aggregate/readme.md (new file, 3 lines)
@ -0,0 +1,3 @@
This module is responsible for aggregating the link graph from the partitioned services, and exposing a unified
view of the link graph. It does not keep any data or state, but instead delegates to the partitioned
services.
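A rough usage sketch follows. It is not part of the commit: the consumer class is hypothetical, but the client type and the methods used (`waitReady`, `countLinksToDomain`, `countLinksFromDomain`, `getLinksFromDomain`) are taken from `AggregateLinkGraphClient` as it appears in this change set, and the client itself would normally be provided by Guice.

```java
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;

import java.time.Duration;
import java.util.List;

/** Hypothetical consumer of the aggregated link graph. */
public class LinkGraphQueryExample {
    private final AggregateLinkGraphClient linkGraphClient;

    public LinkGraphQueryExample(AggregateLinkGraphClient linkGraphClient) {
        this.linkGraphClient = linkGraphClient;
    }

    public void printNeighbourCounts(int domainId) {
        // Wait for the gRPC channel pool to discover an upstream link graph service
        if (!linkGraphClient.waitReady(Duration.ofSeconds(30))) {
            throw new IllegalStateException("link-graph service not available");
        }

        // Answered by the aggregate service, which delegates to the partitioned services
        int incoming = linkGraphClient.countLinksToDomain(domainId);
        int outgoing = linkGraphClient.countLinksFromDomain(domainId);
        List<Integer> destinations = linkGraphClient.getLinksFromDomain(domainId);

        System.out.println("domain " + domainId + ": " + incoming + " incoming, "
                + outgoing + " outgoing, destinations: " + destinations);
    }
}
```

The same client also exposes `getAllDomainLinks()` for bulk traversal of the whole graph, which is how the export actor and the domain ranking sources in this commit consume it.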
@ -11,7 +11,7 @@ java {
|
||||
}
|
||||
}
|
||||
|
||||
jar.archiveBaseName = 'index-domain-links-api'
|
||||
jar.archiveBaseName = 'link-graph-api'
|
||||
|
||||
apply from: "$rootProject.projectDir/protobuf.gradle"
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
@ -1,10 +1,8 @@
|
||||
package nu.marginalia.api.indexdomainlinks;
|
||||
package nu.marginalia.api.linkgraph;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.api.domainlink.DomainLinksApiGrpc;
|
||||
import nu.marginalia.api.domainlink.Empty;
|
||||
import nu.marginalia.api.domainlink.RpcDomainId;
|
||||
import nu.marginalia.api.linkgraph.LinkGraphApiGrpc;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
@ -17,24 +15,26 @@ import org.slf4j.LoggerFactory;
|
||||
import java.time.Duration;
|
||||
import java.util.List;
|
||||
|
||||
@Singleton
|
||||
public class AggregateDomainLinksClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(AggregateDomainLinksClient.class);
|
||||
import static nu.marginalia.api.linkgraph.LinkGraphApiGrpc.*;
|
||||
|
||||
private final GrpcSingleNodeChannelPool<DomainLinksApiGrpc.DomainLinksApiBlockingStub> channelPool;
|
||||
@Singleton
|
||||
public class AggregateLinkGraphClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(AggregateLinkGraphClient.class);
|
||||
|
||||
private final GrpcSingleNodeChannelPool<LinkGraphApiBlockingStub> channelPool;
|
||||
|
||||
@Inject
|
||||
public AggregateDomainLinksClient(GrpcChannelPoolFactory factory) {
|
||||
public AggregateLinkGraphClient(GrpcChannelPoolFactory factory) {
|
||||
this.channelPool = factory.createSingle(
|
||||
ServiceKey.forGrpcApi(DomainLinksApiGrpc.class, ServicePartition.any()),
|
||||
DomainLinksApiGrpc::newBlockingStub);
|
||||
ServiceKey.forGrpcApi(LinkGraphApiGrpc.class, ServicePartition.any()),
|
||||
LinkGraphApiGrpc::newBlockingStub);
|
||||
}
|
||||
|
||||
|
||||
public AllLinks getAllDomainLinks() {
|
||||
AllLinks links = new AllLinks();
|
||||
|
||||
channelPool.call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::getAllLinks)
|
||||
channelPool.call(LinkGraphApiBlockingStub::getAllLinks)
|
||||
.run(Empty.getDefaultInstance())
|
||||
.forEachRemaining(pairs -> {
|
||||
for (int i = 0; i < pairs.getDestIdsCount(); i++) {
|
||||
@ -47,7 +47,7 @@ public class AggregateDomainLinksClient {
|
||||
|
||||
public List<Integer> getLinksToDomain(int domainId) {
|
||||
try {
|
||||
return channelPool.call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::getLinksToDomain)
|
||||
return channelPool.call(LinkGraphApiBlockingStub::getLinksToDomain)
|
||||
.run(RpcDomainId.newBuilder().setDomainId(domainId).build())
|
||||
.getDomainIdList()
|
||||
.stream()
|
||||
@ -62,7 +62,7 @@ public class AggregateDomainLinksClient {
|
||||
|
||||
public List<Integer> getLinksFromDomain(int domainId) {
|
||||
try {
|
||||
return channelPool.call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::getLinksFromDomain)
|
||||
return channelPool.call(LinkGraphApiBlockingStub::getLinksFromDomain)
|
||||
.run(RpcDomainId.newBuilder().setDomainId(domainId).build())
|
||||
.getDomainIdList()
|
||||
.stream()
|
||||
@ -78,7 +78,7 @@ public class AggregateDomainLinksClient {
|
||||
|
||||
public int countLinksToDomain(int domainId) {
|
||||
try {
|
||||
return channelPool.call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::countLinksToDomain)
|
||||
return channelPool.call(LinkGraphApiBlockingStub::countLinksToDomain)
|
||||
.run(RpcDomainId.newBuilder().setDomainId(domainId).build())
|
||||
.getIdCount();
|
||||
|
||||
@ -91,7 +91,7 @@ public class AggregateDomainLinksClient {
|
||||
|
||||
public int countLinksFromDomain(int domainId) {
|
||||
try {
|
||||
return channelPool.call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::countLinksFromDomain)
|
||||
return channelPool.call(LinkGraphApiBlockingStub::countLinksFromDomain)
|
||||
.run(RpcDomainId.newBuilder().setDomainId(domainId).build())
|
||||
.getIdCount();
|
||||
}
|
@ -0,0 +1,29 @@
|
||||
package nu.marginalia.api.linkgraph;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@Singleton
|
||||
public class PartitionLinkGraphClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(PartitionLinkGraphClient.class);
|
||||
|
||||
private final GrpcMultiNodeChannelPool<LinkGraphApiGrpc.LinkGraphApiBlockingStub> channelPool;
|
||||
|
||||
@Inject
|
||||
public PartitionLinkGraphClient(GrpcChannelPoolFactory factory) {
|
||||
this.channelPool = factory.createMulti(
|
||||
ServiceKey.forGrpcApi(LinkGraphApiGrpc.class, ServicePartition.multi()),
|
||||
LinkGraphApiGrpc::newBlockingStub);
|
||||
}
|
||||
|
||||
public GrpcMultiNodeChannelPool<LinkGraphApiGrpc.LinkGraphApiBlockingStub> getChannelPool() {
|
||||
return channelPool;
|
||||
}
|
||||
|
||||
}
|
@ -1,10 +1,10 @@
|
||||
syntax="proto3";
|
||||
package nu.marginalia.api.domainlinks;
|
||||
package nu.marginalia.api.linkgraph;
|
||||
|
||||
option java_package="nu.marginalia.api.domainlink";
|
||||
option java_package="nu.marginalia.api.linkgraph";
|
||||
option java_multiple_files=true;
|
||||
|
||||
service DomainLinksApi {
|
||||
service LinkGraphApi {
|
||||
rpc getAllLinks(Empty) returns (stream RpcDomainIdPairs) {}
|
||||
rpc getLinksFromDomain(RpcDomainId) returns (RpcDomainIdList) {}
|
||||
rpc getLinksToDomain(RpcDomainId) returns (RpcDomainIdList) {}
|
@ -14,7 +14,7 @@ java {
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:functions:domain-links:api')
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
@ -1,13 +1,13 @@
|
||||
package nu.marginalia.linkdb.dlinks;
|
||||
package nu.marginalia.linkgraph;
|
||||
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
/** A database of source-destination pairs of domain IDs. The database is loaded into memory from
|
||||
* a source. The database is then kept in memory, reloading it upon switchInput().
|
||||
/** A repository of source-destination pairs of domain IDs. The database is loaded into memory from
|
||||
* a source. The data is then kept in memory, reloading it upon switchInput().
|
||||
*/
|
||||
public interface DomainLinkDb {
|
||||
public interface DomainLinks {
|
||||
/** Replace the current db file with the provided file. The provided file will be deleted.
|
||||
* The in-memory database MAY be updated to reflect the change.
|
||||
* */
|
@ -1,25 +1,28 @@
|
||||
package nu.marginalia.functions.domainlinks;
|
||||
package nu.marginalia.linkgraph;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.api.domainlink.*;
|
||||
import nu.marginalia.linkdb.dlinks.DomainLinkDb;
|
||||
import nu.marginalia.api.linkgraph.*;
|
||||
import nu.marginalia.api.linkgraph.Empty;
|
||||
import nu.marginalia.api.linkgraph.LinkGraphApiGrpc;
|
||||
|
||||
/** GRPC service for interrogating domain links
|
||||
/** GRPC service for interrogating domain links for a single partition. For accessing the data
|
||||
* in the application, the AggregateLinkGraphService should be used instead via the
|
||||
* AggregateLinkGraphClient.
|
||||
*/
|
||||
public class PartitionDomainLinksService extends DomainLinksApiGrpc.DomainLinksApiImplBase {
|
||||
private final DomainLinkDb domainLinkDb;
|
||||
public class PartitionLinkGraphService extends LinkGraphApiGrpc.LinkGraphApiImplBase {
|
||||
private final DomainLinks domainLinks;
|
||||
|
||||
@Inject
|
||||
public PartitionDomainLinksService(DomainLinkDb domainLinkDb) {
|
||||
this.domainLinkDb = domainLinkDb;
|
||||
public PartitionLinkGraphService(DomainLinks domainLinks) {
|
||||
this.domainLinks = domainLinks;
|
||||
}
|
||||
|
||||
public void getAllLinks(Empty request,
|
||||
io.grpc.stub.StreamObserver<RpcDomainIdPairs> responseObserver) {
|
||||
|
||||
try (var idsConverter = new AllIdsResponseConverter(responseObserver)) {
|
||||
domainLinkDb.forEach(idsConverter::accept);
|
||||
domainLinks.forEach(idsConverter::accept);
|
||||
}
|
||||
|
||||
responseObserver.onCompleted();
|
||||
@ -58,7 +61,7 @@ public class PartitionDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
|
||||
public void getLinksFromDomain(RpcDomainId request,
|
||||
StreamObserver<RpcDomainIdList> responseObserver) {
|
||||
|
||||
var links = domainLinkDb.findDestinations(request.getDomainId());
|
||||
var links = domainLinks.findDestinations(request.getDomainId());
|
||||
|
||||
var rspBuilder = RpcDomainIdList.newBuilder();
|
||||
for (int i = 0; i < links.size(); i++) {
|
||||
@ -73,7 +76,7 @@ public class PartitionDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
|
||||
public void getLinksToDomain(RpcDomainId request,
|
||||
StreamObserver<RpcDomainIdList> responseObserver) {
|
||||
|
||||
var links = domainLinkDb.findSources(request.getDomainId());
|
||||
var links = domainLinks.findSources(request.getDomainId());
|
||||
|
||||
var rspBuilder = RpcDomainIdList.newBuilder();
|
||||
for (int i = 0; i < links.size(); i++) {
|
||||
@ -87,7 +90,7 @@ public class PartitionDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
|
||||
public void countLinksFromDomain(RpcDomainId request,
|
||||
StreamObserver<RpcDomainIdCount> responseObserver) {
|
||||
responseObserver.onNext(RpcDomainIdCount.newBuilder()
|
||||
.setIdCount(domainLinkDb.countDestinations(request.getDomainId()))
|
||||
.setIdCount(domainLinks.countDestinations(request.getDomainId()))
|
||||
.build());
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
@ -95,7 +98,7 @@ public class PartitionDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
|
||||
public void countLinksToDomain(RpcDomainId request,
|
||||
StreamObserver<RpcDomainIdCount> responseObserver) {
|
||||
responseObserver.onNext(RpcDomainIdCount.newBuilder()
|
||||
.setIdCount(domainLinkDb.countSources(request.getDomainId()))
|
||||
.setIdCount(domainLinks.countSources(request.getDomainId()))
|
||||
.build());
|
||||
responseObserver.onCompleted();
|
||||
}
|
@ -1,7 +1,8 @@
|
||||
package nu.marginalia.linkdb.dlinks;
|
||||
package nu.marginalia.linkgraph.impl;
|
||||
|
||||
import com.google.inject.name.Named;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import nu.marginalia.linkgraph.DomainLinks;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -14,13 +15,13 @@ import java.nio.file.StandardCopyOption;
|
||||
* is not yet loaded. This speeds up the startup of the index service, as the database is
|
||||
* loaded in a separate thread.
|
||||
*/
|
||||
public class DelayingDomainLinkDb implements DomainLinkDb {
|
||||
private final static Logger logger = LoggerFactory.getLogger(DelayingDomainLinkDb.class);
|
||||
public class DelayingDomainLinks implements DomainLinks {
|
||||
private final static Logger logger = LoggerFactory.getLogger(DelayingDomainLinks.class);
|
||||
|
||||
private volatile DomainLinkDb currentDb;
|
||||
private volatile DomainLinks currentDb;
|
||||
private final Path filename;
|
||||
|
||||
public DelayingDomainLinkDb(@Named("domain-linkdb-file") Path filename) {
|
||||
public DelayingDomainLinks(@Named("domain-linkdb-file") Path filename) {
|
||||
this.filename = filename;
|
||||
|
||||
// Load the database in a separate thread, so that the constructor can return
|
||||
@ -29,7 +30,7 @@ public class DelayingDomainLinkDb implements DomainLinkDb {
|
||||
|
||||
Thread.ofPlatform().start(() -> {
|
||||
try {
|
||||
currentDb = new FileDomainLinkDb(filename);
|
||||
currentDb = new FileDomainLinks(filename);
|
||||
logger.info("Loaded linkdb");
|
||||
} catch (Exception e) {
|
||||
logger.error("Failed to load linkdb", e);
|
||||
@ -43,7 +44,7 @@ public class DelayingDomainLinkDb implements DomainLinkDb {
|
||||
|
||||
Thread.ofPlatform().start(() -> {
|
||||
try {
|
||||
currentDb = new FileDomainLinkDb(filename);
|
||||
currentDb = new FileDomainLinks(filename);
|
||||
} catch (IOException e) {
|
||||
logger.error("Failed to load linkdb", e);
|
||||
}
|
@ -1,7 +1,9 @@
|
||||
package nu.marginalia.linkdb.dlinks;
|
||||
package nu.marginalia.linkgraph.impl;
|
||||
|
||||
import com.google.inject.name.Named;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import nu.marginalia.linkgraph.DomainLinks;
|
||||
import nu.marginalia.linkgraph.io.DomainLinksLoader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -14,13 +16,13 @@ import java.util.Arrays;
|
||||
/** Canonical DomainLinkDb implementation. The database is loaded into memory from
|
||||
* a file. The database is then kept in memory, reloading it upon switchInput().
|
||||
*/
|
||||
public class FileDomainLinkDb implements DomainLinkDb {
|
||||
private static final Logger logger = LoggerFactory.getLogger(FileDomainLinkDb.class);
|
||||
public class FileDomainLinks implements DomainLinks {
|
||||
private static final Logger logger = LoggerFactory.getLogger(FileDomainLinks.class);
|
||||
private final Path filename;
|
||||
private volatile long[] sourceToDest = new long[0];
|
||||
private volatile long[] destToSource = new long[0];
|
||||
|
||||
public FileDomainLinkDb(@Named("domain-linkdb-file") Path filename) throws IOException {
|
||||
public FileDomainLinks(@Named("domain-linkdb-file") Path filename) throws IOException {
|
||||
this.filename = filename;
|
||||
|
||||
if (Files.exists(filename)) {
|
||||
@ -35,7 +37,7 @@ public class FileDomainLinkDb implements DomainLinkDb {
|
||||
}
|
||||
|
||||
public void loadInput(Path filename) throws IOException {
|
||||
try (var loader = new DomainLinkDbLoader(filename)) {
|
||||
try (var loader = new DomainLinksLoader(filename)) {
|
||||
int size = loader.size();
|
||||
|
||||
var newSourceToDest = new long[size];
|
@ -1,17 +1,17 @@
|
||||
package nu.marginalia.linkdb.dlinks;
|
||||
package nu.marginalia.linkgraph.io;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class DomainLinkDbLoader implements AutoCloseable {
|
||||
public class DomainLinksLoader implements AutoCloseable {
|
||||
private final DataInputStream stream;
|
||||
private final Path filename;
|
||||
|
||||
private long nextVal;
|
||||
|
||||
public DomainLinkDbLoader(Path filename) throws IOException {
|
||||
public DomainLinksLoader(Path filename) throws IOException {
|
||||
this.stream = new DataInputStream(Files.newInputStream(filename));
|
||||
this.filename = filename;
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.linkdb.dlinks;
|
||||
package nu.marginalia.linkgraph.io;
|
||||
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
@ -6,10 +6,10 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
public class DomainLinkDbWriter implements AutoCloseable {
|
||||
public class DomainLinksWriter implements AutoCloseable {
|
||||
private final DataOutputStream stream;
|
||||
|
||||
public DomainLinkDbWriter(Path fileName) throws IOException {
|
||||
public DomainLinksWriter(Path fileName) throws IOException {
|
||||
this.stream = new DataOutputStream(Files.newOutputStream(fileName,
|
||||
StandardOpenOption.CREATE,
|
||||
StandardOpenOption.WRITE,
|
code/functions/link-graph/partition/readme.md (new file, 11 lines)
@ -0,0 +1,11 @@
The link graph partition module is responsible for knowledge about the link graph
for a single index node. It's based on in-memory data structures, and is updated
atomically from a file. A short usage sketch of the writer and loader follows below.

## Central Classes

* [PartitionLinkGraphService](java/nu/marginalia/linkgraph/PartitionLinkGraphService.java)
* [DomainLinks](java/nu/marginalia/linkgraph/DomainLinks.java)
* * [FileDomainLinks](java/nu/marginalia/linkgraph/impl/FileDomainLinks.java)
* [DomainLinksWriter](java/nu/marginalia/linkgraph/io/DomainLinksWriter.java)
* [DomainLinksLoader](java/nu/marginalia/linkgraph/io/DomainLinksLoader.java)
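As a rough sketch of how the file format helpers fit together (modeled on the `DomainLinkDbTest` changes in this commit; the temporary file and the printed output are illustrative only):

```java
import nu.marginalia.linkgraph.io.DomainLinksLoader;
import nu.marginalia.linkgraph.io.DomainLinksWriter;

import java.nio.file.Files;
import java.nio.file.Path;

public class DomainLinksFileExample {
    public static void main(String[] args) throws Exception {
        Path fileName = Files.createTempFile("domain-links", ".dat");

        // Write source -> destination domain id pairs to the binary file
        try (var writer = new DomainLinksWriter(fileName)) {
            writer.write(1, 2);
            writer.write(2, 3);
            writer.write(3, 4);
        }

        // Stream the pairs back out in the order they were written
        try (var loader = new DomainLinksLoader(fileName)) {
            while (loader.next()) {
                System.out.println(loader.getSource() + " -> " + loader.getDest());
            }
        }
    }
}
```

At runtime the index node loads such a file into `FileDomainLinks` and swaps the in-memory data when a new file is switched in.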
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.linkdb;
|
||||
package nu.marginalia.linkgraph;
|
||||
|
||||
import nu.marginalia.linkdb.dlinks.DomainLinkDbLoader;
|
||||
import nu.marginalia.linkdb.dlinks.DomainLinkDbWriter;
|
||||
import nu.marginalia.linkgraph.io.DomainLinksLoader;
|
||||
import nu.marginalia.linkgraph.io.DomainLinksWriter;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
@ -24,7 +24,7 @@ public class DomainLinkDbTest {
|
||||
|
||||
@Test
|
||||
public void testWriteRead() {
|
||||
try (var writer = new DomainLinkDbWriter(fileName)) {
|
||||
try (var writer = new DomainLinksWriter(fileName)) {
|
||||
writer.write(1, 2);
|
||||
writer.write(2, 3);
|
||||
writer.write(3, 4);
|
||||
@ -33,7 +33,7 @@ public class DomainLinkDbTest {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
try (var reader = new DomainLinkDbLoader(fileName)) {
|
||||
try (var reader = new DomainLinksLoader(fileName)) {
|
||||
Assertions.assertTrue(reader.next());
|
||||
Assertions.assertEquals(1, reader.getSource());
|
||||
Assertions.assertEquals(2, reader.getDest());
|
code/functions/link-graph/readme.md (new file, 9 lines)
@ -0,0 +1,9 @@
The link graph subsystem is responsible for knowledge about the link graph.

A SQL database is not very well suited for this, principally because it is too slow to update.
Instead, the link graph is stored in memory and atomically updated from a file. The storage
aspect is handled by the [common/linkdb](../../common/linkdb/) component.

The link graph subsystem has two components: one which is injected into the partitioned services,
e.g. index or execution, and one which aggregates the results from the partitioned services
and exposes a unified view of the link graph.
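As a sketch of the partition-facing side (the surrounding class is hypothetical; `DomainLinks` and the methods used here are taken from the partition module in this commit, where the instance is provided by Guice inside the index service):

```java
import nu.marginalia.linkgraph.DomainLinks;

/** Hypothetical partition-local consumer of the in-memory link graph. */
public class PartitionLinkGraphExample {
    private final DomainLinks domainLinks;

    public PartitionLinkGraphExample(DomainLinks domainLinks) {
        this.domainLinks = domainLinks;
    }

    public void describe(int domainId) {
        // Counts over the in-memory source -> destination pairs held by this partition
        var outgoing = domainLinks.countDestinations(domainId);
        var incoming = domainLinks.countSources(domainId);

        // Ids of the domains this domain links to, within this partition
        var destinations = domainLinks.findDestinations(domainId);

        System.out.println(domainId + ": " + outgoing + " outgoing, " + incoming
                + " incoming, " + destinations.size() + " destination ids");
    }
}
```

Queries that need to span all index nodes go through `AggregateLinkGraphClient` instead, as sketched in the aggregate module's readme.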
@ -5,7 +5,10 @@ import java.nio.ByteBuffer;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.BitSet;
|
||||
|
||||
// It's unclear why this exists, we should probably use a BitSet instead?
|
||||
// Chesterton's fence?
|
||||
public class DenseBitMap {
|
||||
public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8;
|
||||
|
||||
|
code/functions/search-query/readme.md (new file, 4 lines)
@ -0,0 +1,4 @@
The search query subsystem is responsible for parsing a query,
translating it to a request, and then dispatching it to the
appropriate index nodes and translating the responses back again.
@ -17,7 +17,7 @@ dependencies {
|
||||
implementation project(':third-party:commons-codec')
|
||||
|
||||
implementation project(':code:index:api')
|
||||
implementation project(':code:functions:domain-links:api')
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:libraries:btree')
|
||||
|
@ -3,7 +3,7 @@ package nu.marginalia.ranking.domains.data;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
import org.jgrapht.graph.DefaultEdge;
|
||||
@ -13,12 +13,12 @@ import org.jgrapht.graph.DefaultEdge;
|
||||
* which is the same as the regular graph except
|
||||
* the direction of the links have been inverted */
|
||||
public class InvertedLinkGraphSource extends AbstractGraphSource {
|
||||
private final AggregateDomainLinksClient queryClient;
|
||||
private final AggregateLinkGraphClient graphClient;
|
||||
|
||||
@Inject
|
||||
public InvertedLinkGraphSource(HikariDataSource dataSource, AggregateDomainLinksClient queryClient) {
|
||||
public InvertedLinkGraphSource(HikariDataSource dataSource, AggregateLinkGraphClient graphClient) {
|
||||
super(dataSource);
|
||||
this.queryClient = queryClient;
|
||||
this.graphClient = graphClient;
|
||||
}
|
||||
@SneakyThrows
|
||||
@Override
|
||||
@ -27,7 +27,7 @@ public class InvertedLinkGraphSource extends AbstractGraphSource {
|
||||
|
||||
addVertices(graph);
|
||||
|
||||
var allLinks = queryClient.getAllDomainLinks();
|
||||
var allLinks = graphClient.getAllDomainLinks();
|
||||
var iter = allLinks.iterator();
|
||||
while (iter.advance()) {
|
||||
if (!graph.containsVertex(iter.dest())) {
|
||||
|
@ -3,19 +3,19 @@ package nu.marginalia.ranking.domains.data;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
import org.jgrapht.graph.DefaultEdge;
|
||||
|
||||
/** A source for the regular link graph. */
|
||||
public class LinkGraphSource extends AbstractGraphSource {
|
||||
private final AggregateDomainLinksClient domainLinksClient;
|
||||
private final AggregateLinkGraphClient graphClient;
|
||||
|
||||
@Inject
|
||||
public LinkGraphSource(HikariDataSource dataSource, AggregateDomainLinksClient domainLinksClient) {
|
||||
public LinkGraphSource(HikariDataSource dataSource, AggregateLinkGraphClient graphClient) {
|
||||
super(dataSource);
|
||||
this.domainLinksClient = domainLinksClient;
|
||||
this.graphClient = graphClient;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@ -25,7 +25,7 @@ public class LinkGraphSource extends AbstractGraphSource {
|
||||
|
||||
addVertices(graph);
|
||||
|
||||
var allLinks = domainLinksClient.getAllDomainLinks();
|
||||
var allLinks = graphClient.getAllDomainLinks();
|
||||
var iter = allLinks.iterator();
|
||||
while (iter.advance()) {
|
||||
if (!graph.containsVertex(iter.dest())) {
|
||||
|
@ -1,6 +1,6 @@
# Index

This module contains the components that make up the search index.
This index subsystem contains the components that make up the search index.

It exposes an API for querying the index, and contains the logic
for ranking search results. It does not parse the query, that is
@ -10,9 +10,9 @@ the responsibility of the [search-query](../functions/search-query) module.

There are two indexes with accompanying tools for constructing them.

* [index-reverse](reverse-index/) is code for `word->document` indexes. There are two such indexes, one containing only document-word pairs that are flagged as important, e.g. the word appears in the title or has a high TF-IDF. This allows good results to be discovered quickly without having to sift through ten thousand bad ones first.
* [index-reverse](index-reverse/) is code for `word->document` indexes. There are two such indexes, one containing only document-word pairs that are flagged as important, e.g. the word appears in the title or has a high TF-IDF. This allows good results to be discovered quickly without having to sift through ten thousand bad ones first.

* [index-forward](forward-index/) is the `document->word` index containing metadata about each word, such as its position. It is used after identifying candidate search results via the reverse index to fetch metadata and rank the results.
* [index-forward](index-forward/) is the `document->word` index containing metadata about each word, such as its position. It is used after identifying candidate search results via the reverse index to fetch metadata and rank the results.

Additionally, the [index-journal](index-journal/) contains code for constructing a journal of the index, which is used to keep the index up to date.
@ -3,7 +3,7 @@ package nu.marginalia.ranking.domains;
|
||||
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import nu.marginalia.ranking.domains.data.InvertedLinkGraphSource;
|
||||
import nu.marginalia.ranking.domains.data.LinkGraphSource;
|
||||
import nu.marginalia.ranking.domains.data.SimilarityGraphSource;
|
||||
@ -36,8 +36,8 @@ public class RankingAlgorithmsContainerTest {
|
||||
|
||||
static HikariDataSource dataSource;
|
||||
|
||||
AggregateDomainLinksClient domainLinksClient;
|
||||
AggregateDomainLinksClient.AllLinks allLinks;
|
||||
AggregateLinkGraphClient domainLinksClient;
|
||||
AggregateLinkGraphClient.AllLinks allLinks;
|
||||
|
||||
@BeforeAll
|
||||
public static void setup() {
|
||||
@ -66,8 +66,8 @@ public class RankingAlgorithmsContainerTest {
|
||||
|
||||
@BeforeEach
|
||||
public void setupQueryClient() {
|
||||
domainLinksClient = Mockito.mock(AggregateDomainLinksClient.class);
|
||||
allLinks = new AggregateDomainLinksClient.AllLinks();
|
||||
domainLinksClient = Mockito.mock(AggregateLinkGraphClient.class);
|
||||
allLinks = new AggregateLinkGraphClient.AllLinks();
|
||||
when(domainLinksClient.getAllDomainLinks()).thenReturn(allLinks);
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
|
@ -12,5 +12,3 @@ its words, how they stem, POS tags, and so on.

[features-convert/keyword-extraction](../../features-convert/keyword-extraction) uses this code to identify which keywords
are important.

[features-qs/query-parser](../../features-qs/query-parser) also does some language processing.
@ -6,7 +6,3 @@ the TF-IDF score of a keyword.
## Central Classes

* [TermFrequencyDict](java/nu/marginalia/term_frequency_dict/TermFrequencyDict.java)

## See Also

* [tools/term-frequency-extractor](../../tools/term-frequency-extractor) constructs this file
@ -10,8 +10,8 @@ There are three types of indexes:
This is a very light-weight module that delegates the actual work to the modules:

* [features-index/index-reverse](../../features-index/index-reverse)
* [features-index/index-forward](../../features-index/index-forward)
* [features-index/index-reverse](../../index/index-reverse)
* [features-index/index-forward](../../index/index-forward)

Their respective readme files contain more information about the indexes themselves
and how they are constructed.
@ -40,6 +40,8 @@ dependencies {
|
||||
implementation project(':code:process-models:work-log')
|
||||
implementation project(':code:features-convert:keyword-extraction')
|
||||
|
||||
implementation project(':code:functions:link-graph:partition')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation libs.guice
|
||||
|
@ -9,7 +9,7 @@ import com.google.inject.name.Names;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.linkdb.dlinks.DomainLinkDbWriter;
|
||||
import nu.marginalia.linkgraph.io.DomainLinksWriter;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbWriter;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
@ -45,7 +45,7 @@ public class LoaderModule extends AbstractModule {
|
||||
}
|
||||
|
||||
@Inject @Provides @Singleton
|
||||
private DomainLinkDbWriter createDomainLinkdbWriter(FileStorageService service) throws SQLException, IOException {
|
||||
private DomainLinksWriter createDomainLinkdbWriter(FileStorageService service) throws SQLException, IOException {
|
||||
|
||||
Path dbPath = IndexLocations.getLinkdbWritePath(service).resolve(DOMAIN_LINKS_FILE_NAME);
|
||||
|
||||
@ -53,7 +53,7 @@ public class LoaderModule extends AbstractModule {
|
||||
Files.delete(dbPath);
|
||||
}
|
||||
|
||||
return new DomainLinkDbWriter(dbPath);
|
||||
return new DomainLinksWriter(dbPath);
|
||||
}
|
||||
|
||||
private Gson createGson() {
|
||||
|
@ -4,7 +4,7 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader;
|
||||
import nu.marginalia.linkdb.dlinks.DomainLinkDbWriter;
|
||||
import nu.marginalia.linkgraph.io.DomainLinksWriter;
|
||||
import nu.marginalia.loading.LoaderInputData;
|
||||
import nu.marginalia.loading.domains.DomainIdRegistry;
|
||||
import nu.marginalia.model.processed.DomainLinkRecord;
|
||||
@ -20,10 +20,10 @@ public class DomainLinksLoaderService {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomainLinksLoaderService.class);
|
||||
|
||||
private final DomainLinkDbWriter domainLinkDbWriter;
|
||||
private final DomainLinksWriter domainLinkDbWriter;
|
||||
|
||||
@Inject
|
||||
public DomainLinksLoaderService(DomainLinkDbWriter domainLinkDbWriter) {
|
||||
public DomainLinksLoaderService(DomainLinksWriter domainLinkDbWriter) {
|
||||
this.domainLinkDbWriter = domainLinkDbWriter;
|
||||
}
|
||||
|
||||
|
@ -17,7 +17,7 @@ described in [processed-data](../process-models/processed-data/).

The [loading-process](loading-process/) reads the processed data.

It creates an [index journal](../features-index/index-journal),
It creates an [index journal](../index/index-journal),
a [link database](../common/linkdb),
and loads domains and domain-links
into the [MariaDB database](../common/db).
@ -25,7 +25,7 @@ dependencies {
|
||||
implementation project(':code:common:process')
|
||||
implementation project(':code:common:service-discovery')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:functions:domain-links:api')
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
|
@ -4,7 +4,7 @@ import gnu.trove.list.TIntList;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import gnu.trove.map.hash.TIntObjectHashMap;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -35,7 +35,7 @@ public class AdjacenciesData {
|
||||
return ret;
|
||||
}
|
||||
|
||||
public AdjacenciesData(AggregateDomainLinksClient linksClient,
|
||||
public AdjacenciesData(AggregateLinkGraphClient linksClient,
|
||||
DomainAliases aliases) {
|
||||
logger.info("Loading adjacency data");
|
||||
|
||||
|
@ -4,7 +4,7 @@ import com.google.inject.Guice;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.ProcessConfiguration;
|
||||
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
@ -32,7 +32,7 @@ public class WebsiteAdjacenciesCalculator extends ProcessMainClass {
|
||||
private static final Logger logger = LoggerFactory.getLogger(WebsiteAdjacenciesCalculator.class);
|
||||
|
||||
float[] weights;
|
||||
public WebsiteAdjacenciesCalculator(AggregateDomainLinksClient domainLinksClient, HikariDataSource dataSource) throws SQLException {
|
||||
public WebsiteAdjacenciesCalculator(AggregateLinkGraphClient domainLinksClient, HikariDataSource dataSource) throws SQLException {
|
||||
this.dataSource = dataSource;
|
||||
|
||||
domainAliases = new DomainAliases(dataSource);
|
||||
@ -154,7 +154,7 @@ public class WebsiteAdjacenciesCalculator extends ProcessMainClass {
|
||||
|
||||
|
||||
var dataSource = injector.getInstance(HikariDataSource.class);
|
||||
var lc = injector.getInstance(AggregateDomainLinksClient.class);
|
||||
var lc = injector.getInstance(AggregateLinkGraphClient.class);
|
||||
|
||||
if (!lc.waitReady(Duration.ofSeconds(30))) {
|
||||
throw new IllegalStateException("Failed to connect to domain-links");
|
||||
|
@ -23,18 +23,38 @@ eligible index services. The control service is responsible for distributing co
service, and for monitoring the health of the system. It also offers a web interface for operating the system.

### Services

* [core services](services-core/) Most of these services are stateful, memory hungry, and do the heavy lifting.
* * [control](services-core/control-service)
* * [query](services-core/query-service)
* * * Exposes the [functions/link-graph](functions/link-graph) subsystem
* * * Exposes the [functions/search-query](functions/search-query) subsystem
* * [index](services-core/index-service)
* * * Exposes the [index](index) subsystem
* * * Exposes the [functions/link-graph](functions/link-graph) subsystem
* * [executor](services-core/executor-service)
* * * Exposes the [execution](execution) subsystem
* * [assistant](services-core/assistant-service)
* * * Exposes the [functions/math](functions/math) subsystem
* * * Exposes the [functions/domain-info](functions/domain-info) subsystem
* [application services](services-application/) Mostly stateless gateways providing access to the core services.
* * [api](services-application/api-service) - public API
* * [api](services-application/api-service) - public API gateway
* * [search](services-application/search-service) - marginalia search application
* * [dating](services-application/dating-service) - [https://explore.marginalia.nu/](https://explore.marginalia.nu/)
* * [explorer](services-application/explorer-service) - [https://explore2.marginalia.nu/](https://explore2.marginalia.nu/)
* an [internal API](api/)

The system uses a service registry to find the services. The service registry is based on zookeeper,
and is a separate service. The registry doesn't keep track of processes, but APIs. This means that
the system is flexible with regard to reconfiguration. The same code can in principle be run as a micro-service
mesh or as a monolith.

This is an unusual architecture, but it has the benefit that you don't need to think too much about
the layout of the system. You can just request an API and talk to it. Because of this, several of the
services have almost no code of their own. They merely import a library and expose it as a service.

These skeleton services are marked with (S).

Services that expose HTTP endpoints tend to have more code. They are marked with (G).

### Processes

@ -55,7 +75,6 @@ but isolated.

* [features-search](features-search)
* [features-crawl](features-crawl)
* [features-convert](features-convert)
* [features-index](features-index)

### Libraries and primitives
@ -4,8 +4,7 @@ The control service provides an operator's user interface. By default, this int
exposed on port 8081. It does not offer any sort of access control or authentication.

The control service will itself execute tasks that affect the entire system, but delegate
node-specific tasks to the corresponding [executor-service](../executor-service) via the
[executor-api](../../api/executor-api).
node-specific tasks to the corresponding [execution subsystem](../../execution).

Conceptually the application is broken into three parts:
@ -59,7 +59,7 @@ dependencies {
|
||||
|
||||
implementation project(':code:libraries:message-queue')
|
||||
|
||||
implementation project(':code:functions:domain-links:api')
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
|
||||
implementation project(':code:process-models:crawl-spec')
|
||||
implementation project(':code:process-models:crawling-model')
|
||||
|
@ -1,16 +1,10 @@
The executor service is a partitioned service responsible for executing and keeping
track of long running maintenance and operational tasks, such as crawling or data
track of long-running maintenance and operational tasks, such as crawling or data
processing.

It accomplishes this using the [message queue and actor library](../../libraries/message-queue/),
which permits program state to survive crashes and reboots. The executor service is closely
linked to the [control-service](../control-service), which provides a user interface for
much of the executor's functionality.
The executor service is closely linked to the [control-service](../control-service),
which provides a user interface for much of the executor's functionality.

## Central Classes
The service itself is relatively bare of code, but imports and exposes the [execution subsystem](../../execution),
which is responsible for the actual execution of tasks.

* [ExecutorActorControlService](java/nu/marginalia/actor/ExecutorActorControlService.java)

## See Also

* [api/executor-api](../../api/executor-api)
@ -46,8 +46,8 @@ dependencies {
|
||||
implementation project(':code:common:linkdb')
|
||||
|
||||
implementation project(':code:index')
|
||||
implementation project(':code:functions:domain-links:partition')
|
||||
implementation project(':code:functions:domain-links:api')
|
||||
implementation project(':code:functions:link-graph:partition')
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
implementation project(':code:functions:search-query:api')
|
||||
implementation project(':code:index:api')
|
||||
|
||||
|
@ -4,8 +4,8 @@ import com.google.inject.AbstractModule;
|
||||
import com.google.inject.Provides;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.linkdb.dlinks.DomainLinkDb;
|
||||
import nu.marginalia.linkdb.dlinks.DelayingDomainLinkDb;
|
||||
import nu.marginalia.linkgraph.DomainLinks;
|
||||
import nu.marginalia.linkgraph.impl.DelayingDomainLinks;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import org.slf4j.Logger;
|
||||
@ -26,13 +26,13 @@ public class IndexModule extends AbstractModule {
|
||||
|
||||
@Provides
|
||||
@Singleton
|
||||
public DomainLinkDb domainLinkDb (
|
||||
public DomainLinks domainLinkDb (
|
||||
FileStorageService storageService
|
||||
)
|
||||
{
|
||||
Path path = IndexLocations.getLinkdbLivePath(storageService).resolve(DOMAIN_LINKS_FILE_NAME);
|
||||
|
||||
return new DelayingDomainLinkDb(path);
|
||||
return new DelayingDomainLinks(path);
|
||||
}
|
||||
|
||||
@Provides
|
||||
|
@ -3,9 +3,9 @@ package nu.marginalia.index;
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.functions.domainlinks.PartitionDomainLinksService;
|
||||
import nu.marginalia.linkgraph.PartitionLinkGraphService;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.linkdb.dlinks.DomainLinkDb;
|
||||
import nu.marginalia.linkgraph.DomainLinks;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.index.api.IndexMqEndpoints;
|
||||
@ -34,7 +34,7 @@ public class IndexService extends Service {
|
||||
private final FileStorageService fileStorageService;
|
||||
private final DocumentDbReader documentDbReader;
|
||||
|
||||
private final DomainLinkDb domainLinkDb;
|
||||
private final DomainLinks domainLinks;
|
||||
private final ServiceEventLog eventLog;
|
||||
|
||||
|
||||
@ -46,21 +46,21 @@ public class IndexService extends Service {
|
||||
StatefulIndex statefulIndex,
|
||||
FileStorageService fileStorageService,
|
||||
DocumentDbReader documentDbReader,
|
||||
DomainLinkDb domainLinkDb,
|
||||
PartitionDomainLinksService partitionDomainLinksService,
|
||||
DomainLinks domainLinks,
|
||||
PartitionLinkGraphService partitionLinkGraphService,
|
||||
ServiceEventLog eventLog)
|
||||
{
|
||||
super(params,
|
||||
ServicePartition.partition(params.configuration.node()),
|
||||
List.of(indexQueryService,
|
||||
partitionDomainLinksService)
|
||||
partitionLinkGraphService)
|
||||
);
|
||||
|
||||
this.opsService = opsService;
|
||||
this.statefulIndex = statefulIndex;
|
||||
this.fileStorageService = fileStorageService;
|
||||
this.documentDbReader = documentDbReader;
|
||||
this.domainLinkDb = domainLinkDb;
|
||||
this.domainLinks = domainLinks;
|
||||
this.eventLog = eventLog;
|
||||
|
||||
this.init = params.initialization;
|
||||
@ -106,7 +106,7 @@ public class IndexService extends Service {
|
||||
|
||||
if (Files.exists(newPathDomains)) {
|
||||
eventLog.logEvent("INDEX-SWITCH-DOMAIN-LINKDB", "");
|
||||
domainLinkDb.switchInput(newPathDomains);
|
||||
domainLinks.switchInput(newPathDomains);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -50,8 +50,8 @@ dependencies {
|
||||
|
||||
implementation project(':code:functions:search-query')
|
||||
implementation project(':code:functions:search-query:api')
|
||||
implementation project(':code:functions:domain-links:api')
|
||||
implementation project(':code:functions:domain-links:aggregate')
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
implementation project(':code:functions:link-graph:aggregate')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
|
@ -2,7 +2,7 @@ package nu.marginalia.query;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.functions.domainlinks.AggregateDomainLinksService;
|
||||
import nu.marginalia.linkgraph.AggregateLinkGraphService;
|
||||
import nu.marginalia.functions.searchquery.QueryGRPCService;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
@ -17,7 +17,7 @@ public class QueryService extends Service {
|
||||
@SneakyThrows
|
||||
@Inject
|
||||
public QueryService(BaseServiceParams params,
|
||||
AggregateDomainLinksService domainLinksService,
|
||||
AggregateLinkGraphService domainLinksService,
|
||||
QueryGRPCService queryGRPCService,
|
||||
QueryBasicInterface queryBasicInterface)
|
||||
{
|
||||
|
@ -17,9 +17,9 @@ include 'code:functions:math:api'
|
||||
include 'code:functions:domain-info'
|
||||
include 'code:functions:domain-info:api'
|
||||
|
||||
include 'code:functions:domain-links:partition'
|
||||
include 'code:functions:domain-links:aggregate'
|
||||
include 'code:functions:domain-links:api'
|
||||
include 'code:functions:link-graph:partition'
|
||||
include 'code:functions:link-graph:aggregate'
|
||||
include 'code:functions:link-graph:api'
|
||||
|
||||
include 'code:functions:search-query'
|
||||
include 'code:functions:search-query:api'
|
||||
|