Clean up documentation and rename domain-links to link-graph

This commit is contained in:
Viktor Lofgren 2024-02-28 11:40:11 +01:00
parent 3a65fe8917
commit 9f1649636e
54 changed files with 269 additions and 224 deletions

View File

@ -1,15 +1,3 @@
## Domain Link Database
The domain link database contains information about links
between domains. It is a static in-memory database loaded
from a binary file.
* [DomainLinkDb](java/nu/marginalia/linkdb/DomainLinkDb.java)
* * [FileDomainLinkDb](java/nu/marginalia/linkdb/FileDomainLinkDb.java)
* * [SqlDomainLinkDb](java/nu/marginalia/linkdb/SqlDomainLinkDb.java)
* [DomainLinkDbWriter](java/nu/marginalia/linkdb/DomainLinkDbWriter.java)
* [DomainLinkDbLoader](java/nu/marginalia/linkdb/DomainLinkDbLoader.java)
## Document Database ## Document Database
The document database contains information about links, The document database contains information about links,
@ -21,10 +9,10 @@ is not in the MariaDB database is that this would make updates to
this information take effect in production immediately, even before this information take effect in production immediately, even before
the information was searchable. the information was searchable.
* [DocumentLinkDbWriter](java/nu/marginalia/linkdb/DocumentDbWriter.java) * [DocumentDbWriter](java/nu/marginalia/linkdb/docs/DocumentDbWriter.java)
* [DocumentLinkDbLoader](java/nu/marginalia/linkdb/DocumentDbReader.java) * [DocumentDbReader](java/nu/marginalia/linkdb/docs/DocumentDbReader.java)
## See Also ## See Also
These databases are constructed by the [loading-process](../../processes/loading-process), and consumed by the [index-service](../../services-core/index-service). The database is constructed by the [loading-process](../../processes/loading-process), and consumed by the [index-service](../../services-core/index-service).

View File

@ -7,6 +7,5 @@ as shared models.
* [config](config/) contains some `@Inject`ables. * [config](config/) contains some `@Inject`ables.
* [renderer](renderer/) contains utility code for rendering website templates. * [renderer](renderer/) contains utility code for rendering website templates.
* [service](service/) is the shared base classes for main methods and web services. * [service](service/) is the shared base classes for main methods and web services.
* [service-client](service-client/) is the shared base class for RPC. * [service-discovery](service-discovery) contains tools that let the services find each other and communicate.
* [service-discovery](service-discovery) contains tools that lets the services find each other.
* [process](process/) contains boiler plate for batch processes. * [process](process/) contains boiler plate for batch processes.

View File

@ -34,7 +34,7 @@ dependencies {
implementation project(':code:libraries:message-queue') implementation project(':code:libraries:message-queue')
implementation project(':code:functions:domain-links:api') implementation project(':code:functions:link-graph:api')
implementation project(':code:execution:api') implementation project(':code:execution:api')
implementation project(':code:process-models:crawl-spec') implementation project(':code:process-models:crawl-spec')

View File

@ -6,7 +6,7 @@ import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep; import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient; import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.storage.model.FileStorageType;
@ -32,7 +32,7 @@ public class ExportDataActor extends RecordActorPrototype {
private final FileStorageService storageService; private final FileStorageService storageService;
private final HikariDataSource dataSource; private final HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
private final AggregateDomainLinksClient domainLinksClient; private final AggregateLinkGraphClient linkGraphClient;
public record Export() implements ActorStep {} public record Export() implements ActorStep {}
public record ExportBlacklist(FileStorageId fid) implements ActorStep {} public record ExportBlacklist(FileStorageId fid) implements ActorStep {}
@ -114,7 +114,7 @@ public class ExportDataActor extends RecordActorPrototype {
var tmpFile = Files.createTempFile(storage.asPath(), "export", ".csv.gz", var tmpFile = Files.createTempFile(storage.asPath(), "export", ".csv.gz",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
var allLinks = domainLinksClient.getAllDomainLinks(); var allLinks = linkGraphClient.getAllDomainLinks();
try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))))) try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))))
{ {
@ -155,12 +155,12 @@ public class ExportDataActor extends RecordActorPrototype {
public ExportDataActor(Gson gson, public ExportDataActor(Gson gson,
FileStorageService storageService, FileStorageService storageService,
HikariDataSource dataSource, HikariDataSource dataSource,
AggregateDomainLinksClient domainLinksClient) AggregateLinkGraphClient linkGraphClient)
{ {
super(gson); super(gson);
this.storageService = storageService; this.storageService = storageService;
this.dataSource = dataSource; this.dataSource = dataSource;
this.domainLinksClient = domainLinksClient; this.linkGraphClient = linkGraphClient;
} }
} }

12
code/execution/readme.md Normal file
View File

@ -0,0 +1,12 @@
The execution subsystem is responsible for the execution of long-running tasks on each
index node. It lives in the [executor-service](../services-core/executor-service) module.
It accomplishes this using the [message queue and actor library](../libraries/message-queue/),
which permits program state to survive crashes and reboots.
The subsystem exposes four [APIs](api/src/main/protobuf/executor-api.proto):
* Execution API - for starting and stopping tasks, also contains miscellaneous commands
* Crawl API - for managing the crawl workflow
* Sideload API - for sideloading data
* Export API - for exporting data

View File

@ -16,4 +16,3 @@ holistically, not by question or answer, it is necessary to re-arrange
the data (which is very large). SQLite does a decent job of enabling the data (which is very large). SQLite does a decent job of enabling
this task. this task.
See [tools/stackexchange-converter](../../tools/stackexchange-converter).

View File

@ -15,7 +15,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies { dependencies {
implementation project(':code:functions:domain-info:api') implementation project(':code:functions:domain-info:api')
implementation project(':code:functions:domain-links:api') implementation project(':code:functions:link-graph:api')
implementation project(':code:common:config') implementation project(':code:common:config')
implementation project(':code:common:service') implementation project(':code:common:service')

View File

@ -2,7 +2,7 @@ package nu.marginalia.functions.domains;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.api.domains.RpcDomainInfoResponse; import nu.marginalia.api.domains.RpcDomainInfoResponse;
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient; import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DbDomainQueries; import nu.marginalia.db.DbDomainQueries;
@ -21,7 +21,7 @@ public class DomainInformationService {
private final GeoIpDictionary geoIpDictionary; private final GeoIpDictionary geoIpDictionary;
private DbDomainQueries dbDomainQueries; private DbDomainQueries dbDomainQueries;
private final AggregateDomainLinksClient domainLinksClient; private final AggregateLinkGraphClient linkGraphClient;
private HikariDataSource dataSource; private HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
@ -29,11 +29,11 @@ public class DomainInformationService {
public DomainInformationService( public DomainInformationService(
DbDomainQueries dbDomainQueries, DbDomainQueries dbDomainQueries,
GeoIpDictionary geoIpDictionary, GeoIpDictionary geoIpDictionary,
AggregateDomainLinksClient domainLinksClient, AggregateLinkGraphClient linkGraphClient,
HikariDataSource dataSource) { HikariDataSource dataSource) {
this.dbDomainQueries = dbDomainQueries; this.dbDomainQueries = dbDomainQueries;
this.geoIpDictionary = geoIpDictionary; this.geoIpDictionary = geoIpDictionary;
this.domainLinksClient = domainLinksClient; this.linkGraphClient = linkGraphClient;
this.dataSource = dataSource; this.dataSource = dataSource;
} }
@ -84,8 +84,8 @@ public class DomainInformationService {
inCrawlQueue = rs.next(); inCrawlQueue = rs.next();
builder.setInCrawlQueue(inCrawlQueue); builder.setInCrawlQueue(inCrawlQueue);
builder.setIncomingLinks(domainLinksClient.countLinksToDomain(domainId)); builder.setIncomingLinks(linkGraphClient.countLinksToDomain(domainId));
builder.setOutboundLinks(domainLinksClient.countLinksFromDomain(domainId)); builder.setOutboundLinks(linkGraphClient.countLinksFromDomain(domainId));
rs = stmt.executeQuery(STR.""" rs = stmt.executeQuery(STR."""
SELECT KNOWN_URLS, GOOD_URLS, VISITED_URLS FROM DOMAIN_METADATA WHERE ID=\{domainId} SELECT KNOWN_URLS, GOOD_URLS, VISITED_URLS FROM DOMAIN_METADATA WHERE ID=\{domainId}

View File

@ -11,7 +11,7 @@ import gnu.trove.set.hash.TIntHashSet;
import it.unimi.dsi.fastutil.ints.Int2DoubleArrayMap; import it.unimi.dsi.fastutil.ints.Int2DoubleArrayMap;
import nu.marginalia.api.domains.*; import nu.marginalia.api.domains.*;
import nu.marginalia.api.domains.model.SimilarDomain; import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient; import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import org.roaringbitmap.RoaringBitmap; import org.roaringbitmap.RoaringBitmap;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -20,7 +20,6 @@ import org.slf4j.LoggerFactory;
import java.sql.ResultSet; import java.sql.ResultSet;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.BitSet;
import java.util.List; import java.util.List;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
import java.util.stream.IntStream; import java.util.stream.IntStream;
@ -29,7 +28,7 @@ public class SimilarDomainsService {
private static final Logger logger = LoggerFactory.getLogger(SimilarDomainsService.class); private static final Logger logger = LoggerFactory.getLogger(SimilarDomainsService.class);
private final HikariDataSource dataSource; private final HikariDataSource dataSource;
private final AggregateDomainLinksClient domainLinksClient; private final AggregateLinkGraphClient linkGraphClient;
private volatile TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000); private volatile TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000);
private volatile int[] domainIdxToId; private volatile int[] domainIdxToId;
@ -45,9 +44,9 @@ public class SimilarDomainsService {
volatile boolean isReady = false; volatile boolean isReady = false;
@Inject @Inject
public SimilarDomainsService(HikariDataSource dataSource, AggregateDomainLinksClient domainLinksClient) { public SimilarDomainsService(HikariDataSource dataSource, AggregateLinkGraphClient linkGraphClient) {
this.dataSource = dataSource; this.dataSource = dataSource;
this.domainLinksClient = domainLinksClient; this.linkGraphClient = linkGraphClient;
Executors.newSingleThreadExecutor().submit(this::init); Executors.newSingleThreadExecutor().submit(this::init);
} }
@ -262,7 +261,7 @@ public class SimilarDomainsService {
private TIntSet getLinkingIdsDToS(int domainIdx) { private TIntSet getLinkingIdsDToS(int domainIdx) {
var items = new TIntHashSet(); var items = new TIntHashSet();
for (int id : domainLinksClient.getLinksFromDomain(domainIdxToId[domainIdx])) { for (int id : linkGraphClient.getLinksFromDomain(domainIdxToId[domainIdx])) {
items.add(domainIdToIdx.get(id)); items.add(domainIdToIdx.get(id));
} }
@ -272,7 +271,7 @@ public class SimilarDomainsService {
private TIntSet getLinkingIdsSToD(int domainIdx) { private TIntSet getLinkingIdsSToD(int domainIdx) {
var items = new TIntHashSet(); var items = new TIntHashSet();
for (int id : domainLinksClient.getLinksToDomain(domainIdxToId[domainIdx])) { for (int id : linkGraphClient.getLinksToDomain(domainIdxToId[domainIdx])) {
items.add(domainIdToIdx.get(id)); items.add(domainIdToIdx.get(id));
} }

View File

@ -1,30 +0,0 @@
package nu.marginalia.api.indexdomainlinks;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.api.domainlink.DomainLinksApiGrpc;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey;
import nu.marginalia.service.discovery.property.ServicePartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Injectable holder for a multi-node gRPC channel pool of
 * {@link DomainLinksApiGrpc.DomainLinksApiBlockingStub} instances.
 *
 * <p>The pool is created once, at construction time, for the service key
 * derived from {@link DomainLinksApiGrpc} and {@link ServicePartition#multi()}.
 */
@Singleton
public class PartitionDomainLinksClient {
    private static final Logger logger = LoggerFactory.getLogger(PartitionDomainLinksClient.class);

    private final GrpcMultiNodeChannelPool<DomainLinksApiGrpc.DomainLinksApiBlockingStub> channelPool;

    /**
     * Builds the channel pool from the supplied factory.
     *
     * @param factory factory used to create the multi-node channel pool
     */
    @Inject
    public PartitionDomainLinksClient(GrpcChannelPoolFactory factory) {
        channelPool = factory.createMulti(ServiceKey.forGrpcApi(DomainLinksApiGrpc.class, ServicePartition.multi()), DomainLinksApiGrpc::newBlockingStub);
    }

    /**
     * @return the channel pool created in the constructor
     */
    public GrpcMultiNodeChannelPool<DomainLinksApiGrpc.DomainLinksApiBlockingStub> getChannelPool() {
        return this.channelPool;
    }
}

View File

@ -14,7 +14,7 @@ java {
apply from: "$rootProject.projectDir/srcsets.gradle" apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies { dependencies {
implementation project(':code:functions:domain-links:api') implementation project(':code:functions:link-graph:api')
implementation project(':code:common:config') implementation project(':code:common:config')
implementation project(':code:common:service') implementation project(':code:common:service')

View File

@ -1,20 +1,25 @@
package nu.marginalia.functions.domainlinks; package nu.marginalia.linkgraph;
import com.google.inject.Inject; import com.google.inject.Inject;
import io.grpc.stub.StreamObserver; import io.grpc.stub.StreamObserver;
import nu.marginalia.api.domainlink.*; import nu.marginalia.api.linkgraph.*;
import nu.marginalia.api.indexdomainlinks.PartitionDomainLinksClient; import nu.marginalia.api.linkgraph.PartitionLinkGraphClient;
import nu.marginalia.api.linkgraph.LinkGraphApiGrpc;
import nu.marginalia.api.linkgraph.LinkGraphApiGrpc.LinkGraphApiBlockingStub;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.util.List; import java.util.List;
public class AggregateDomainLinksService extends DomainLinksApiGrpc.DomainLinksApiImplBase { /** This class is responsible for aggregating the link graph data from the partitioned link graph
private static final Logger logger = LoggerFactory.getLogger(AggregateDomainLinksService.class); * services.
private final PartitionDomainLinksClient client; */
public class AggregateLinkGraphService extends LinkGraphApiGrpc.LinkGraphApiImplBase {
private static final Logger logger = LoggerFactory.getLogger(AggregateLinkGraphService.class);
private final PartitionLinkGraphClient client;
@Inject @Inject
public AggregateDomainLinksService(PartitionDomainLinksClient client) { public AggregateLinkGraphService(PartitionLinkGraphClient client) {
this.client = client; this.client = client;
} }
@ -22,7 +27,7 @@ public class AggregateDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
public void getAllLinks(Empty request, public void getAllLinks(Empty request,
StreamObserver<RpcDomainIdPairs> responseObserver) { StreamObserver<RpcDomainIdPairs> responseObserver) {
client.getChannelPool().call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::getAllLinks) client.getChannelPool().call(LinkGraphApiBlockingStub::getAllLinks)
.run(Empty.getDefaultInstance()) .run(Empty.getDefaultInstance())
.forEach(iter -> iter.forEachRemaining(responseObserver::onNext)); .forEach(iter -> iter.forEachRemaining(responseObserver::onNext));
@ -34,7 +39,7 @@ public class AggregateDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
StreamObserver<RpcDomainIdList> responseObserver) { StreamObserver<RpcDomainIdList> responseObserver) {
var rspBuilder = RpcDomainIdList.newBuilder(); var rspBuilder = RpcDomainIdList.newBuilder();
client.getChannelPool().call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::getLinksFromDomain) client.getChannelPool().call(LinkGraphApiBlockingStub::getLinksFromDomain)
.run(request) .run(request)
.stream() .stream()
.map(RpcDomainIdList::getDomainIdList) .map(RpcDomainIdList::getDomainIdList)
@ -51,7 +56,7 @@ public class AggregateDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
var rspBuilder = RpcDomainIdList.newBuilder(); var rspBuilder = RpcDomainIdList.newBuilder();
client.getChannelPool().call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::getLinksToDomain) client.getChannelPool().call(LinkGraphApiBlockingStub::getLinksToDomain)
.run(request) .run(request)
.stream() .stream()
.map(RpcDomainIdList::getDomainIdList) .map(RpcDomainIdList::getDomainIdList)
@ -65,7 +70,7 @@ public class AggregateDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
@Override @Override
public void countLinksFromDomain(RpcDomainId request, public void countLinksFromDomain(RpcDomainId request,
StreamObserver<RpcDomainIdCount> responseObserver) { StreamObserver<RpcDomainIdCount> responseObserver) {
int sum = client.getChannelPool().call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::countLinksFromDomain) int sum = client.getChannelPool().call(LinkGraphApiBlockingStub::countLinksFromDomain)
.run(request) .run(request)
.stream() .stream()
.mapToInt(RpcDomainIdCount::getIdCount) .mapToInt(RpcDomainIdCount::getIdCount)
@ -81,7 +86,7 @@ public class AggregateDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
public void countLinksToDomain(RpcDomainId request, public void countLinksToDomain(RpcDomainId request,
StreamObserver<RpcDomainIdCount> responseObserver) { StreamObserver<RpcDomainIdCount> responseObserver) {
int sum = client.getChannelPool().call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::countLinksToDomain) int sum = client.getChannelPool().call(LinkGraphApiBlockingStub::countLinksToDomain)
.run(request) .run(request)
.stream() .stream()
.mapToInt(RpcDomainIdCount::getIdCount) .mapToInt(RpcDomainIdCount::getIdCount)

View File

@ -0,0 +1,3 @@
This module is responsible for aggregating the link graph from the partitioned services, and exposing a unified
view of the link graph. It does not keep any data or state, but instead delegates to the partitioned
services.

View File

@ -11,7 +11,7 @@ java {
} }
} }
jar.archiveBaseName = 'index-domain-links-api' jar.archiveBaseName = 'link-graph-api'
apply from: "$rootProject.projectDir/protobuf.gradle" apply from: "$rootProject.projectDir/protobuf.gradle"
apply from: "$rootProject.projectDir/srcsets.gradle" apply from: "$rootProject.projectDir/srcsets.gradle"

View File

@ -1,10 +1,8 @@
package nu.marginalia.api.indexdomainlinks; package nu.marginalia.api.linkgraph;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.api.domainlink.DomainLinksApiGrpc; import nu.marginalia.api.linkgraph.LinkGraphApiGrpc;
import nu.marginalia.api.domainlink.Empty;
import nu.marginalia.api.domainlink.RpcDomainId;
import nu.marginalia.service.client.GrpcChannelPoolFactory; import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcSingleNodeChannelPool; import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey; import nu.marginalia.service.discovery.property.ServiceKey;
@ -17,24 +15,26 @@ import org.slf4j.LoggerFactory;
import java.time.Duration; import java.time.Duration;
import java.util.List; import java.util.List;
@Singleton import static nu.marginalia.api.linkgraph.LinkGraphApiGrpc.*;
public class AggregateDomainLinksClient {
private static final Logger logger = LoggerFactory.getLogger(AggregateDomainLinksClient.class);
private final GrpcSingleNodeChannelPool<DomainLinksApiGrpc.DomainLinksApiBlockingStub> channelPool; @Singleton
public class AggregateLinkGraphClient {
private static final Logger logger = LoggerFactory.getLogger(AggregateLinkGraphClient.class);
private final GrpcSingleNodeChannelPool<LinkGraphApiBlockingStub> channelPool;
@Inject @Inject
public AggregateDomainLinksClient(GrpcChannelPoolFactory factory) { public AggregateLinkGraphClient(GrpcChannelPoolFactory factory) {
this.channelPool = factory.createSingle( this.channelPool = factory.createSingle(
ServiceKey.forGrpcApi(DomainLinksApiGrpc.class, ServicePartition.any()), ServiceKey.forGrpcApi(LinkGraphApiGrpc.class, ServicePartition.any()),
DomainLinksApiGrpc::newBlockingStub); LinkGraphApiGrpc::newBlockingStub);
} }
public AllLinks getAllDomainLinks() { public AllLinks getAllDomainLinks() {
AllLinks links = new AllLinks(); AllLinks links = new AllLinks();
channelPool.call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::getAllLinks) channelPool.call(LinkGraphApiBlockingStub::getAllLinks)
.run(Empty.getDefaultInstance()) .run(Empty.getDefaultInstance())
.forEachRemaining(pairs -> { .forEachRemaining(pairs -> {
for (int i = 0; i < pairs.getDestIdsCount(); i++) { for (int i = 0; i < pairs.getDestIdsCount(); i++) {
@ -47,7 +47,7 @@ public class AggregateDomainLinksClient {
public List<Integer> getLinksToDomain(int domainId) { public List<Integer> getLinksToDomain(int domainId) {
try { try {
return channelPool.call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::getLinksToDomain) return channelPool.call(LinkGraphApiBlockingStub::getLinksToDomain)
.run(RpcDomainId.newBuilder().setDomainId(domainId).build()) .run(RpcDomainId.newBuilder().setDomainId(domainId).build())
.getDomainIdList() .getDomainIdList()
.stream() .stream()
@ -62,7 +62,7 @@ public class AggregateDomainLinksClient {
public List<Integer> getLinksFromDomain(int domainId) { public List<Integer> getLinksFromDomain(int domainId) {
try { try {
return channelPool.call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::getLinksFromDomain) return channelPool.call(LinkGraphApiBlockingStub::getLinksFromDomain)
.run(RpcDomainId.newBuilder().setDomainId(domainId).build()) .run(RpcDomainId.newBuilder().setDomainId(domainId).build())
.getDomainIdList() .getDomainIdList()
.stream() .stream()
@ -78,7 +78,7 @@ public class AggregateDomainLinksClient {
public int countLinksToDomain(int domainId) { public int countLinksToDomain(int domainId) {
try { try {
return channelPool.call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::countLinksToDomain) return channelPool.call(LinkGraphApiBlockingStub::countLinksToDomain)
.run(RpcDomainId.newBuilder().setDomainId(domainId).build()) .run(RpcDomainId.newBuilder().setDomainId(domainId).build())
.getIdCount(); .getIdCount();
@ -91,7 +91,7 @@ public class AggregateDomainLinksClient {
public int countLinksFromDomain(int domainId) { public int countLinksFromDomain(int domainId) {
try { try {
return channelPool.call(DomainLinksApiGrpc.DomainLinksApiBlockingStub::countLinksFromDomain) return channelPool.call(LinkGraphApiBlockingStub::countLinksFromDomain)
.run(RpcDomainId.newBuilder().setDomainId(domainId).build()) .run(RpcDomainId.newBuilder().setDomainId(domainId).build())
.getIdCount(); .getIdCount();
} }

View File

@ -0,0 +1,29 @@
package nu.marginalia.api.linkgraph;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey;
import nu.marginalia.service.discovery.property.ServicePartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Injectable holder for a multi-node gRPC channel pool of
 * {@link LinkGraphApiGrpc.LinkGraphApiBlockingStub} instances.
 *
 * <p>The pool is created once, at construction time, for the service key
 * derived from {@link LinkGraphApiGrpc} and {@link ServicePartition#multi()}.
 */
@Singleton
public class PartitionLinkGraphClient {
    private static final Logger logger = LoggerFactory.getLogger(PartitionLinkGraphClient.class);

    private final GrpcMultiNodeChannelPool<LinkGraphApiGrpc.LinkGraphApiBlockingStub> channelPool;

    /**
     * Builds the channel pool from the supplied factory.
     *
     * @param factory factory used to create the multi-node channel pool
     */
    @Inject
    public PartitionLinkGraphClient(GrpcChannelPoolFactory factory) {
        channelPool = factory.createMulti(ServiceKey.forGrpcApi(LinkGraphApiGrpc.class, ServicePartition.multi()), LinkGraphApiGrpc::newBlockingStub);
    }

    /**
     * @return the channel pool created in the constructor
     */
    public GrpcMultiNodeChannelPool<LinkGraphApiGrpc.LinkGraphApiBlockingStub> getChannelPool() {
        return this.channelPool;
    }
}

View File

@ -1,10 +1,10 @@
syntax="proto3"; syntax="proto3";
package nu.marginalia.api.domainlinks; package nu.marginalia.api.linkgraph;
option java_package="nu.marginalia.api.domainlink"; option java_package="nu.marginalia.api.linkgraph";
option java_multiple_files=true; option java_multiple_files=true;
service DomainLinksApi { service LinkGraphApi {
rpc getAllLinks(Empty) returns (stream RpcDomainIdPairs) {} rpc getAllLinks(Empty) returns (stream RpcDomainIdPairs) {}
rpc getLinksFromDomain(RpcDomainId) returns (RpcDomainIdList) {} rpc getLinksFromDomain(RpcDomainId) returns (RpcDomainIdList) {}
rpc getLinksToDomain(RpcDomainId) returns (RpcDomainIdList) {} rpc getLinksToDomain(RpcDomainId) returns (RpcDomainIdList) {}

View File

@ -14,7 +14,7 @@ java {
apply from: "$rootProject.projectDir/srcsets.gradle" apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies { dependencies {
implementation project(':code:functions:domain-links:api') implementation project(':code:functions:link-graph:api')
implementation project(':code:common:config') implementation project(':code:common:config')
implementation project(':code:common:service') implementation project(':code:common:service')

View File

@ -1,13 +1,13 @@
package nu.marginalia.linkdb.dlinks; package nu.marginalia.linkgraph;
import gnu.trove.list.array.TIntArrayList; import gnu.trove.list.array.TIntArrayList;
import java.nio.file.Path; import java.nio.file.Path;
/** A database of source-destination pairs of domain IDs. The database is loaded into memory from /** A repository of source-destination pairs of domain IDs. The database is loaded into memory from
* a source. The database is then kept in memory, reloading it upon switchInput(). * a source. The data is then kept in memory, reloading it upon switchInput().
*/ */
public interface DomainLinkDb { public interface DomainLinks {
/** Replace the current db file with the provided file. The provided file will be deleted. /** Replace the current db file with the provided file. The provided file will be deleted.
* The in-memory database MAY be updated to reflect the change. * The in-memory database MAY be updated to reflect the change.
* */ * */

View File

@ -1,25 +1,28 @@
package nu.marginalia.functions.domainlinks; package nu.marginalia.linkgraph;
import com.google.inject.Inject; import com.google.inject.Inject;
import io.grpc.stub.StreamObserver; import io.grpc.stub.StreamObserver;
import nu.marginalia.api.domainlink.*; import nu.marginalia.api.linkgraph.*;
import nu.marginalia.linkdb.dlinks.DomainLinkDb; import nu.marginalia.api.linkgraph.Empty;
import nu.marginalia.api.linkgraph.LinkGraphApiGrpc;
/** GRPC service for interrogating domain links /** GRPC service for interrogating domain links for a single partition. For accessing the data
* in the application, the AggregateLinkGraphService should be used instead via the
* AggregateLinkGraphClient.
*/ */
public class PartitionDomainLinksService extends DomainLinksApiGrpc.DomainLinksApiImplBase { public class PartitionLinkGraphService extends LinkGraphApiGrpc.LinkGraphApiImplBase {
private final DomainLinkDb domainLinkDb; private final DomainLinks domainLinks;
@Inject @Inject
public PartitionDomainLinksService(DomainLinkDb domainLinkDb) { public PartitionLinkGraphService(DomainLinks domainLinks) {
this.domainLinkDb = domainLinkDb; this.domainLinks = domainLinks;
} }
public void getAllLinks(Empty request, public void getAllLinks(Empty request,
io.grpc.stub.StreamObserver<RpcDomainIdPairs> responseObserver) { io.grpc.stub.StreamObserver<RpcDomainIdPairs> responseObserver) {
try (var idsConverter = new AllIdsResponseConverter(responseObserver)) { try (var idsConverter = new AllIdsResponseConverter(responseObserver)) {
domainLinkDb.forEach(idsConverter::accept); domainLinks.forEach(idsConverter::accept);
} }
responseObserver.onCompleted(); responseObserver.onCompleted();
@ -58,7 +61,7 @@ public class PartitionDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
public void getLinksFromDomain(RpcDomainId request, public void getLinksFromDomain(RpcDomainId request,
StreamObserver<RpcDomainIdList> responseObserver) { StreamObserver<RpcDomainIdList> responseObserver) {
var links = domainLinkDb.findDestinations(request.getDomainId()); var links = domainLinks.findDestinations(request.getDomainId());
var rspBuilder = RpcDomainIdList.newBuilder(); var rspBuilder = RpcDomainIdList.newBuilder();
for (int i = 0; i < links.size(); i++) { for (int i = 0; i < links.size(); i++) {
@ -73,7 +76,7 @@ public class PartitionDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
public void getLinksToDomain(RpcDomainId request, public void getLinksToDomain(RpcDomainId request,
StreamObserver<RpcDomainIdList> responseObserver) { StreamObserver<RpcDomainIdList> responseObserver) {
var links = domainLinkDb.findSources(request.getDomainId()); var links = domainLinks.findSources(request.getDomainId());
var rspBuilder = RpcDomainIdList.newBuilder(); var rspBuilder = RpcDomainIdList.newBuilder();
for (int i = 0; i < links.size(); i++) { for (int i = 0; i < links.size(); i++) {
@ -87,7 +90,7 @@ public class PartitionDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
public void countLinksFromDomain(RpcDomainId request, public void countLinksFromDomain(RpcDomainId request,
StreamObserver<RpcDomainIdCount> responseObserver) { StreamObserver<RpcDomainIdCount> responseObserver) {
responseObserver.onNext(RpcDomainIdCount.newBuilder() responseObserver.onNext(RpcDomainIdCount.newBuilder()
.setIdCount(domainLinkDb.countDestinations(request.getDomainId())) .setIdCount(domainLinks.countDestinations(request.getDomainId()))
.build()); .build());
responseObserver.onCompleted(); responseObserver.onCompleted();
} }
@ -95,7 +98,7 @@ public class PartitionDomainLinksService extends DomainLinksApiGrpc.DomainLinksA
public void countLinksToDomain(RpcDomainId request, public void countLinksToDomain(RpcDomainId request,
StreamObserver<RpcDomainIdCount> responseObserver) { StreamObserver<RpcDomainIdCount> responseObserver) {
responseObserver.onNext(RpcDomainIdCount.newBuilder() responseObserver.onNext(RpcDomainIdCount.newBuilder()
.setIdCount(domainLinkDb.countSources(request.getDomainId())) .setIdCount(domainLinks.countSources(request.getDomainId()))
.build()); .build());
responseObserver.onCompleted(); responseObserver.onCompleted();
} }

View File

@ -1,7 +1,8 @@
package nu.marginalia.linkdb.dlinks; package nu.marginalia.linkgraph.impl;
import com.google.inject.name.Named; import com.google.inject.name.Named;
import gnu.trove.list.array.TIntArrayList; import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.linkgraph.DomainLinks;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -14,13 +15,13 @@ import java.nio.file.StandardCopyOption;
* is not yet loaded. This speeds up the startup of the index service, as the database is * is not yet loaded. This speeds up the startup of the index service, as the database is
* loaded in a separate thread. * loaded in a separate thread.
*/ */
public class DelayingDomainLinkDb implements DomainLinkDb { public class DelayingDomainLinks implements DomainLinks {
private final static Logger logger = LoggerFactory.getLogger(DelayingDomainLinkDb.class); private final static Logger logger = LoggerFactory.getLogger(DelayingDomainLinks.class);
private volatile DomainLinkDb currentDb; private volatile DomainLinks currentDb;
private final Path filename; private final Path filename;
public DelayingDomainLinkDb(@Named("domain-linkdb-file") Path filename) { public DelayingDomainLinks(@Named("domain-linkdb-file") Path filename) {
this.filename = filename; this.filename = filename;
// Load the database in a separate thread, so that the constructor can return // Load the database in a separate thread, so that the constructor can return
@ -29,7 +30,7 @@ public class DelayingDomainLinkDb implements DomainLinkDb {
Thread.ofPlatform().start(() -> { Thread.ofPlatform().start(() -> {
try { try {
currentDb = new FileDomainLinkDb(filename); currentDb = new FileDomainLinks(filename);
logger.info("Loaded linkdb"); logger.info("Loaded linkdb");
} catch (Exception e) { } catch (Exception e) {
logger.error("Failed to load linkdb", e); logger.error("Failed to load linkdb", e);
@ -43,7 +44,7 @@ public class DelayingDomainLinkDb implements DomainLinkDb {
Thread.ofPlatform().start(() -> { Thread.ofPlatform().start(() -> {
try { try {
currentDb = new FileDomainLinkDb(filename); currentDb = new FileDomainLinks(filename);
} catch (IOException e) { } catch (IOException e) {
logger.error("Failed to load linkdb", e); logger.error("Failed to load linkdb", e);
} }

View File

@ -1,7 +1,9 @@
package nu.marginalia.linkdb.dlinks; package nu.marginalia.linkgraph.impl;
import com.google.inject.name.Named; import com.google.inject.name.Named;
import gnu.trove.list.array.TIntArrayList; import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.linkgraph.DomainLinks;
import nu.marginalia.linkgraph.io.DomainLinksLoader;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -14,13 +16,13 @@ import java.util.Arrays;
/** Canonical DomainLinkDb implementation. The database is loaded into memory from /** Canonical DomainLinkDb implementation. The database is loaded into memory from
* a file. The database is then kept in memory, reloading it upon switchInput(). * a file. The database is then kept in memory, reloading it upon switchInput().
*/ */
public class FileDomainLinkDb implements DomainLinkDb { public class FileDomainLinks implements DomainLinks {
private static final Logger logger = LoggerFactory.getLogger(FileDomainLinkDb.class); private static final Logger logger = LoggerFactory.getLogger(FileDomainLinks.class);
private final Path filename; private final Path filename;
private volatile long[] sourceToDest = new long[0]; private volatile long[] sourceToDest = new long[0];
private volatile long[] destToSource = new long[0]; private volatile long[] destToSource = new long[0];
public FileDomainLinkDb(@Named("domain-linkdb-file") Path filename) throws IOException { public FileDomainLinks(@Named("domain-linkdb-file") Path filename) throws IOException {
this.filename = filename; this.filename = filename;
if (Files.exists(filename)) { if (Files.exists(filename)) {
@ -35,7 +37,7 @@ public class FileDomainLinkDb implements DomainLinkDb {
} }
public void loadInput(Path filename) throws IOException { public void loadInput(Path filename) throws IOException {
try (var loader = new DomainLinkDbLoader(filename)) { try (var loader = new DomainLinksLoader(filename)) {
int size = loader.size(); int size = loader.size();
var newSourceToDest = new long[size]; var newSourceToDest = new long[size];

View File

@ -1,17 +1,17 @@
package nu.marginalia.linkdb.dlinks; package nu.marginalia.linkgraph.io;
import java.io.DataInputStream; import java.io.DataInputStream;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
public class DomainLinkDbLoader implements AutoCloseable { public class DomainLinksLoader implements AutoCloseable {
private final DataInputStream stream; private final DataInputStream stream;
private final Path filename; private final Path filename;
private long nextVal; private long nextVal;
public DomainLinkDbLoader(Path filename) throws IOException { public DomainLinksLoader(Path filename) throws IOException {
this.stream = new DataInputStream(Files.newInputStream(filename)); this.stream = new DataInputStream(Files.newInputStream(filename));
this.filename = filename; this.filename = filename;
} }

View File

@ -1,4 +1,4 @@
package nu.marginalia.linkdb.dlinks; package nu.marginalia.linkgraph.io;
import java.io.DataOutputStream; import java.io.DataOutputStream;
import java.io.IOException; import java.io.IOException;
@ -6,10 +6,10 @@ import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.StandardOpenOption; import java.nio.file.StandardOpenOption;
public class DomainLinkDbWriter implements AutoCloseable { public class DomainLinksWriter implements AutoCloseable {
private final DataOutputStream stream; private final DataOutputStream stream;
public DomainLinkDbWriter(Path fileName) throws IOException { public DomainLinksWriter(Path fileName) throws IOException {
this.stream = new DataOutputStream(Files.newOutputStream(fileName, this.stream = new DataOutputStream(Files.newOutputStream(fileName,
StandardOpenOption.CREATE, StandardOpenOption.CREATE,
StandardOpenOption.WRITE, StandardOpenOption.WRITE,

View File

@ -0,0 +1,11 @@
The link graph partition module is responsible for knowledge about the link graph
for a single index node. It's based on in-memory data structures, and is updated
atomically from file.
## Central Classes
* [PartitionLinkGraphService](java/nu/marginalia/linkgraph/PartitionLinkGraphService.java)
* [DomainLinks](java/nu/marginalia/linkgraph/DomainLinks.java)
* * [FileDomainLinks](java/nu/marginalia/linkgraph/impl/FileDomainLinks.java)
* [DomainLinksWriter](java/nu/marginalia/linkgraph/io/DomainLinksWriter.java)
* [DomainLinksLoader](java/nu/marginalia/linkgraph/io/DomainLinksLoader.java)

View File

@ -1,7 +1,7 @@
package nu.marginalia.linkdb; package nu.marginalia.linkgraph;
import nu.marginalia.linkdb.dlinks.DomainLinkDbLoader; import nu.marginalia.linkgraph.io.DomainLinksLoader;
import nu.marginalia.linkdb.dlinks.DomainLinkDbWriter; import nu.marginalia.linkgraph.io.DomainLinksWriter;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
@ -24,7 +24,7 @@ public class DomainLinkDbTest {
@Test @Test
public void testWriteRead() { public void testWriteRead() {
try (var writer = new DomainLinkDbWriter(fileName)) { try (var writer = new DomainLinksWriter(fileName)) {
writer.write(1, 2); writer.write(1, 2);
writer.write(2, 3); writer.write(2, 3);
writer.write(3, 4); writer.write(3, 4);
@ -33,7 +33,7 @@ public class DomainLinkDbTest {
throw new RuntimeException(ex); throw new RuntimeException(ex);
} }
try (var reader = new DomainLinkDbLoader(fileName)) { try (var reader = new DomainLinksLoader(fileName)) {
Assertions.assertTrue(reader.next()); Assertions.assertTrue(reader.next());
Assertions.assertEquals(1, reader.getSource()); Assertions.assertEquals(1, reader.getSource());
Assertions.assertEquals(2, reader.getDest()); Assertions.assertEquals(2, reader.getDest());

View File

@ -0,0 +1,9 @@
The link graph subsystem is responsible for knowledge about the link graph.
A SQL database is not very well suited for this, principally because it's too slow to update;
instead, the link graph is stored in memory and atomically updated from file. The storage
aspect is handled by the [common/linkdb](../../common/linkdb/) component.
The link graph subsystem has two components, one which injects into the partitioned services,
e.g. index or execution, and one which aggregates the results from the partitioned services,
and exposes a unified view of the link graph.

View File

@ -5,7 +5,10 @@ import java.nio.ByteBuffer;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.StandardOpenOption; import java.nio.file.StandardOpenOption;
import java.util.BitSet;
// It's unclear why this exists, we should probably use a BitSet instead?
// Chesterton's fence?
public class DenseBitMap { public class DenseBitMap {
public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8; public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8;

View File

@ -0,0 +1,4 @@
The search query subsystem is responsible for parsing a query,
translating it to a request, and then dispatching it to the
appropriate index nodes and translating the responses back again.

View File

@ -17,7 +17,7 @@ dependencies {
implementation project(':third-party:commons-codec') implementation project(':third-party:commons-codec')
implementation project(':code:index:api') implementation project(':code:index:api')
implementation project(':code:functions:domain-links:api') implementation project(':code:functions:link-graph:api')
implementation project(':code:libraries:array') implementation project(':code:libraries:array')
implementation project(':code:libraries:btree') implementation project(':code:libraries:btree')

View File

@ -3,7 +3,7 @@ package nu.marginalia.ranking.domains.data;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient; import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
import org.jgrapht.Graph; import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultDirectedGraph; import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge; import org.jgrapht.graph.DefaultEdge;
@ -13,12 +13,12 @@ import org.jgrapht.graph.DefaultEdge;
* which is the same as the regular graph except * which is the same as the regular graph except
* the direction of the links have been inverted */ * the direction of the links have been inverted */
public class InvertedLinkGraphSource extends AbstractGraphSource { public class InvertedLinkGraphSource extends AbstractGraphSource {
private final AggregateDomainLinksClient queryClient; private final AggregateLinkGraphClient graphClient;
@Inject @Inject
public InvertedLinkGraphSource(HikariDataSource dataSource, AggregateDomainLinksClient queryClient) { public InvertedLinkGraphSource(HikariDataSource dataSource, AggregateLinkGraphClient graphClient) {
super(dataSource); super(dataSource);
this.queryClient = queryClient; this.graphClient = graphClient;
} }
@SneakyThrows @SneakyThrows
@Override @Override
@ -27,7 +27,7 @@ public class InvertedLinkGraphSource extends AbstractGraphSource {
addVertices(graph); addVertices(graph);
var allLinks = queryClient.getAllDomainLinks(); var allLinks = graphClient.getAllDomainLinks();
var iter = allLinks.iterator(); var iter = allLinks.iterator();
while (iter.advance()) { while (iter.advance()) {
if (!graph.containsVertex(iter.dest())) { if (!graph.containsVertex(iter.dest())) {

View File

@ -3,19 +3,19 @@ package nu.marginalia.ranking.domains.data;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient; import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
import org.jgrapht.Graph; import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultDirectedGraph; import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge; import org.jgrapht.graph.DefaultEdge;
/** A source for the regular link graph. */ /** A source for the regular link graph. */
public class LinkGraphSource extends AbstractGraphSource { public class LinkGraphSource extends AbstractGraphSource {
private final AggregateDomainLinksClient domainLinksClient; private final AggregateLinkGraphClient graphClient;
@Inject @Inject
public LinkGraphSource(HikariDataSource dataSource, AggregateDomainLinksClient domainLinksClient) { public LinkGraphSource(HikariDataSource dataSource, AggregateLinkGraphClient graphClient) {
super(dataSource); super(dataSource);
this.domainLinksClient = domainLinksClient; this.graphClient = graphClient;
} }
@SneakyThrows @SneakyThrows
@ -25,7 +25,7 @@ public class LinkGraphSource extends AbstractGraphSource {
addVertices(graph); addVertices(graph);
var allLinks = domainLinksClient.getAllDomainLinks(); var allLinks = graphClient.getAllDomainLinks();
var iter = allLinks.iterator(); var iter = allLinks.iterator();
while (iter.advance()) { while (iter.advance()) {
if (!graph.containsVertex(iter.dest())) { if (!graph.containsVertex(iter.dest())) {

View File

@ -1,6 +1,6 @@
# Index # Index
This module contains the components that make up the search index. This index subsystem contains the components that make up the search index.
It exposes an API for querying the index, and contains the logic It exposes an API for querying the index, and contains the logic
for ranking search results. It does not parse the query, that is for ranking search results. It does not parse the query, that is
@ -10,9 +10,9 @@ the responsibility of the [search-query](../functions/search-query) module.
There are two indexes with accompanying tools for constructing them. There are two indexes with accompanying tools for constructing them.
* [index-reverse](reverse-index/) is code for `word->document` indexes. There are two such indexes, one containing only document-word pairs that are flagged as important, e.g. the word appears in the title or has a high TF-IDF. This allows good results to be discovered quickly without having to sift through ten thousand bad ones first. * [index-reverse](index-reverse/) is code for `word->document` indexes. There are two such indexes, one containing only document-word pairs that are flagged as important, e.g. the word appears in the title or has a high TF-IDF. This allows good results to be discovered quickly without having to sift through ten thousand bad ones first.
* [index-forward](forward-index/) is the `document->word` index containing metadata about each word, such as its position. It is used after identifying candidate search results via the reverse index to fetch metadata and rank the results. * [index-forward](index-forward/) is the `document->word` index containing metadata about each word, such as its position. It is used after identifying candidate search results via the reverse index to fetch metadata and rank the results.
Additionally, the [index-journal](index-journal/) contains code for constructing a journal of the index, which is used to keep the index up to date. Additionally, the [index-journal](index-journal/) contains code for constructing a journal of the index, which is used to keep the index up to date.

View File

@ -3,7 +3,7 @@ package nu.marginalia.ranking.domains;
import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient; import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
import nu.marginalia.ranking.domains.data.InvertedLinkGraphSource; import nu.marginalia.ranking.domains.data.InvertedLinkGraphSource;
import nu.marginalia.ranking.domains.data.LinkGraphSource; import nu.marginalia.ranking.domains.data.LinkGraphSource;
import nu.marginalia.ranking.domains.data.SimilarityGraphSource; import nu.marginalia.ranking.domains.data.SimilarityGraphSource;
@ -36,8 +36,8 @@ public class RankingAlgorithmsContainerTest {
static HikariDataSource dataSource; static HikariDataSource dataSource;
AggregateDomainLinksClient domainLinksClient; AggregateLinkGraphClient domainLinksClient;
AggregateDomainLinksClient.AllLinks allLinks; AggregateLinkGraphClient.AllLinks allLinks;
@BeforeAll @BeforeAll
public static void setup() { public static void setup() {
@ -66,8 +66,8 @@ public class RankingAlgorithmsContainerTest {
@BeforeEach @BeforeEach
public void setupQueryClient() { public void setupQueryClient() {
domainLinksClient = Mockito.mock(AggregateDomainLinksClient.class); domainLinksClient = Mockito.mock(AggregateLinkGraphClient.class);
allLinks = new AggregateDomainLinksClient.AllLinks(); allLinks = new AggregateLinkGraphClient.AllLinks();
when(domainLinksClient.getAllDomainLinks()).thenReturn(allLinks); when(domainLinksClient.getAllDomainLinks()).thenReturn(allLinks);
try (var conn = dataSource.getConnection(); try (var conn = dataSource.getConnection();

View File

@ -11,6 +11,4 @@ its words, how they stem, POS tags, and so on.
## See Also ## See Also
[features-convert/keyword-extraction](../../features-convert/keyword-extraction) uses this code to identify which keywords [features-convert/keyword-extraction](../../features-convert/keyword-extraction) uses this code to identify which keywords
are important. are important.
[features-qs/query-parser](../../features-qs/query-parser) also does some language processing.

View File

@ -6,7 +6,3 @@ the TF-IDF score of a keyword.
## Central Classes ## Central Classes
* [TermFrequencyDict](java/nu/marginalia/term_frequency_dict/TermFrequencyDict.java) * [TermFrequencyDict](java/nu/marginalia/term_frequency_dict/TermFrequencyDict.java)
## See Also
* [tools/term-frequency-extractor](../../tools/term-frequency-extractor) constructs this file

View File

@ -10,8 +10,8 @@ There are three types of indexes:
This is a very light-weight module that delegates the actual work to the modules: This is a very light-weight module that delegates the actual work to the modules:
* [features-index/index-reverse](../../features-index/index-reverse) * [features-index/index-reverse](../../index/index-reverse)
* [features-index/index-forward](../../features-index/index-forward) * [features-index/index-forward](../../index/index-forward)
Their respective readme files contain more information about the indexes themselves Their respective readme files contain more information about the indexes themselves
and how they are constructed. and how they are constructed.

View File

@ -40,6 +40,8 @@ dependencies {
implementation project(':code:process-models:work-log') implementation project(':code:process-models:work-log')
implementation project(':code:features-convert:keyword-extraction') implementation project(':code:features-convert:keyword-extraction')
implementation project(':code:functions:link-graph:partition')
implementation libs.bundles.slf4j implementation libs.bundles.slf4j
implementation libs.guice implementation libs.guice

View File

@ -9,7 +9,7 @@ import com.google.inject.name.Names;
import nu.marginalia.LanguageModels; import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.IndexLocations; import nu.marginalia.IndexLocations;
import nu.marginalia.linkdb.dlinks.DomainLinkDbWriter; import nu.marginalia.linkgraph.io.DomainLinksWriter;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.linkdb.docs.DocumentDbWriter; import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.model.gson.GsonFactory;
@ -45,7 +45,7 @@ public class LoaderModule extends AbstractModule {
} }
@Inject @Provides @Singleton @Inject @Provides @Singleton
private DomainLinkDbWriter createDomainLinkdbWriter(FileStorageService service) throws SQLException, IOException { private DomainLinksWriter createDomainLinkdbWriter(FileStorageService service) throws SQLException, IOException {
Path dbPath = IndexLocations.getLinkdbWritePath(service).resolve(DOMAIN_LINKS_FILE_NAME); Path dbPath = IndexLocations.getLinkdbWritePath(service).resolve(DOMAIN_LINKS_FILE_NAME);
@ -53,7 +53,7 @@ public class LoaderModule extends AbstractModule {
Files.delete(dbPath); Files.delete(dbPath);
} }
return new DomainLinkDbWriter(dbPath); return new DomainLinksWriter(dbPath);
} }
private Gson createGson() { private Gson createGson() {

View File

@ -4,7 +4,7 @@ import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader; import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader;
import nu.marginalia.linkdb.dlinks.DomainLinkDbWriter; import nu.marginalia.linkgraph.io.DomainLinksWriter;
import nu.marginalia.loading.LoaderInputData; import nu.marginalia.loading.LoaderInputData;
import nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.loading.domains.DomainIdRegistry;
import nu.marginalia.model.processed.DomainLinkRecord; import nu.marginalia.model.processed.DomainLinkRecord;
@ -20,10 +20,10 @@ public class DomainLinksLoaderService {
private static final Logger logger = LoggerFactory.getLogger(DomainLinksLoaderService.class); private static final Logger logger = LoggerFactory.getLogger(DomainLinksLoaderService.class);
private final DomainLinkDbWriter domainLinkDbWriter; private final DomainLinksWriter domainLinkDbWriter;
@Inject @Inject
public DomainLinksLoaderService(DomainLinkDbWriter domainLinkDbWriter) { public DomainLinksLoaderService(DomainLinksWriter domainLinkDbWriter) {
this.domainLinkDbWriter = domainLinkDbWriter; this.domainLinkDbWriter = domainLinkDbWriter;
} }

View File

@ -17,7 +17,7 @@ described in [processed-data](../process-models/processed-data/).
The [loading-process](loading-process/) reads the processed data. The [loading-process](loading-process/) reads the processed data.
It creates an [index journal](../features-index/index-journal), It creates an [index journal](../index/index-journal),
a [link database](../common/linkdb), a [link database](../common/linkdb),
and loads domains and domain-links and loads domains and domain-links
into the [MariaDB database](../common/db). into the [MariaDB database](../common/db).

View File

@ -25,7 +25,7 @@ dependencies {
implementation project(':code:common:process') implementation project(':code:common:process')
implementation project(':code:common:service-discovery') implementation project(':code:common:service-discovery')
implementation project(':code:common:service') implementation project(':code:common:service')
implementation project(':code:functions:domain-links:api') implementation project(':code:functions:link-graph:api')
implementation libs.bundles.slf4j implementation libs.bundles.slf4j

View File

@ -4,7 +4,7 @@ import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList; import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntObjectHashMap; import gnu.trove.map.hash.TIntObjectHashMap;
import gnu.trove.set.hash.TIntHashSet; import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient; import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
import org.roaringbitmap.RoaringBitmap; import org.roaringbitmap.RoaringBitmap;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -35,7 +35,7 @@ public class AdjacenciesData {
return ret; return ret;
} }
public AdjacenciesData(AggregateDomainLinksClient linksClient, public AdjacenciesData(AggregateLinkGraphClient linksClient,
DomainAliases aliases) { DomainAliases aliases) {
logger.info("Loading adjacency data"); logger.info("Loading adjacency data");

View File

@ -4,7 +4,7 @@ import com.google.inject.Guice;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.ProcessConfiguration; import nu.marginalia.ProcessConfiguration;
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient; import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
import nu.marginalia.db.DbDomainQueries; import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat;
@ -32,7 +32,7 @@ public class WebsiteAdjacenciesCalculator extends ProcessMainClass {
private static final Logger logger = LoggerFactory.getLogger(WebsiteAdjacenciesCalculator.class); private static final Logger logger = LoggerFactory.getLogger(WebsiteAdjacenciesCalculator.class);
float[] weights; float[] weights;
public WebsiteAdjacenciesCalculator(AggregateDomainLinksClient domainLinksClient, HikariDataSource dataSource) throws SQLException { public WebsiteAdjacenciesCalculator(AggregateLinkGraphClient domainLinksClient, HikariDataSource dataSource) throws SQLException {
this.dataSource = dataSource; this.dataSource = dataSource;
domainAliases = new DomainAliases(dataSource); domainAliases = new DomainAliases(dataSource);
@ -154,7 +154,7 @@ public class WebsiteAdjacenciesCalculator extends ProcessMainClass {
var dataSource = injector.getInstance(HikariDataSource.class); var dataSource = injector.getInstance(HikariDataSource.class);
var lc = injector.getInstance(AggregateDomainLinksClient.class); var lc = injector.getInstance(AggregateLinkGraphClient.class);
if (!lc.waitReady(Duration.ofSeconds(30))) { if (!lc.waitReady(Duration.ofSeconds(30))) {
throw new IllegalStateException("Failed to connect to domain-links"); throw new IllegalStateException("Failed to connect to domain-links");

View File

@ -23,18 +23,38 @@ eligible index services. The control service is responsible for distributing co
service, and for monitoring the health of the system. It also offers a web interface for operating the system. service, and for monitoring the health of the system. It also offers a web interface for operating the system.
### Services ### Services
* [core services](services-core/) Most of these services are stateful, memory hungry, and doing heavy lifting. * [core services](services-core/) Most of these services are stateful, memory hungry, and doing heavy lifting.
* * [control](services-core/control-service) * * [control](services-core/control-service)
* * [query](services-core/query-service) * * [query](services-core/query-service)
* * * Exposes the [functions/link-graph](functions/link-graph) subsystem
* * * Exposes the [functions/search-query](functions/search-query) subsystem
* * [index](services-core/index-service) * * [index](services-core/index-service)
* * * Exposes the [index](index) subsystem
* * * Exposes the [functions/link-graph](functions/link-graph) subsystem
* * [executor](services-core/executor-service) * * [executor](services-core/executor-service)
* * * Exposes the [execution](execution) subsystem
* * [assistant](services-core/assistant-service) * * [assistant](services-core/assistant-service)
* * * Exposes the [functions/math](functions/math) subsystem
* * * Exposes the [functions/domain-info](functions/domain-info) subsystem
* [application services](services-application/) Mostly stateless gateways providing access to the core services. * [application services](services-application/) Mostly stateless gateways providing access to the core services.
* * [api](services-application/api-service) - public API * * [api](services-application/api-service) - public API gateway
* * [search](services-application/search-service) - marginalia search application * * [search](services-application/search-service) - marginalia search application
* * [dating](services-application/dating-service) - [https://explore.marginalia.nu/](https://explore.marginalia.nu/) * * [dating](services-application/dating-service) - [https://explore.marginalia.nu/](https://explore.marginalia.nu/)
* * [explorer](services-application/explorer-service) - [https://explore2.marginalia.nu/](https://explore2.marginalia.nu/) * * [explorer](services-application/explorer-service) - [https://explore2.marginalia.nu/](https://explore2.marginalia.nu/)
* an [internal API](api/)
The system uses a service registry to find the services. The service registry is based on zookeeper,
and is a separate service. The registry doesn't keep track of processes, but APIs. This means that
the system is flexible to reconfiguration. The same code can in principle be run as a micro-service
mesh or as a monolith.
This is an unusual architecture, but it has the benefit that you don't need to think too much about
the layout of the system. You can just request an API and talk to it. Because of this, several of the
services have almost no code of their own. They merely import a library and expose it as a service.
These skeleton services are marked with (S).
Services that expose HTTP endpoints tend to have more code. They are marked with (G).
### Processes ### Processes
@ -55,7 +75,6 @@ but isolated.
* [features-search](features-search) * [features-search](features-search)
* [features-crawl](features-crawl) * [features-crawl](features-crawl)
* [features-convert](features-convert) * [features-convert](features-convert)
* [features-index](features-index)
### Libraries and primitives ### Libraries and primitives

View File

@ -4,8 +4,7 @@ The control service provides an operator's user interface. By default, this int
exposed on port 8081. It does not offer any sort of access control or authentication. exposed on port 8081. It does not offer any sort of access control or authentication.
The control service will itself execute tasks that affect the entire system, but delegate The control service will itself execute tasks that affect the entire system, but delegate
node-specific tasks to the corresponding [executor-service](../executor-service) via the node-specific tasks to the corresponding [execution subsystem](../../execution).
[executor-api](../../api/executor-api).
Conceptually the application is broken into three parts: Conceptually the application is broken into three parts:

View File

@ -59,7 +59,7 @@ dependencies {
implementation project(':code:libraries:message-queue') implementation project(':code:libraries:message-queue')
implementation project(':code:functions:domain-links:api') implementation project(':code:functions:link-graph:api')
implementation project(':code:process-models:crawl-spec') implementation project(':code:process-models:crawl-spec')
implementation project(':code:process-models:crawling-model') implementation project(':code:process-models:crawling-model')

View File

@ -1,16 +1,10 @@
The executor service is a partitioned service responsible for executing and keeping The executor service is a partitioned service responsible for executing and keeping
track of long running maintenance and operational tasks, such as crawling or data track of long-running maintenance and operational tasks, such as crawling or data
processing. processing.
It accomplishes this using the [message queue and actor library](../../libraries/message-queue/), The executor service is closely linked to the [control-service](../control-service),
which permits program state to survive crashes and reboots. The executor service is closely which provides a user interface for much of the executor's functionality.
linked to the [control-service](../control-service), which provides a user interface for
much of the executor's functionality.
## Central Classes The service is itself relatively bare of code, but imports and exposes the [execution subsystem](../../execution),
which is responsible for the actual execution of tasks.
* [ExecutorActorControlService](java/nu/marginalia/actor/ExecutorActorControlService.java)
## See Also
* [api/executor-api](../../api/executor-api)

View File

@ -46,8 +46,8 @@ dependencies {
implementation project(':code:common:linkdb') implementation project(':code:common:linkdb')
implementation project(':code:index') implementation project(':code:index')
implementation project(':code:functions:domain-links:partition') implementation project(':code:functions:link-graph:partition')
implementation project(':code:functions:domain-links:api') implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:search-query:api') implementation project(':code:functions:search-query:api')
implementation project(':code:index:api') implementation project(':code:index:api')

View File

@ -4,8 +4,8 @@ import com.google.inject.AbstractModule;
import com.google.inject.Provides; import com.google.inject.Provides;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import com.google.inject.name.Named; import com.google.inject.name.Named;
import nu.marginalia.linkdb.dlinks.DomainLinkDb; import nu.marginalia.linkgraph.DomainLinks;
import nu.marginalia.linkdb.dlinks.DelayingDomainLinkDb; import nu.marginalia.linkgraph.impl.DelayingDomainLinks;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.IndexLocations; import nu.marginalia.IndexLocations;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -26,13 +26,13 @@ public class IndexModule extends AbstractModule {
@Provides @Provides
@Singleton @Singleton
public DomainLinkDb domainLinkDb ( public DomainLinks domainLinkDb (
FileStorageService storageService FileStorageService storageService
) )
{ {
Path path = IndexLocations.getLinkdbLivePath(storageService).resolve(DOMAIN_LINKS_FILE_NAME); Path path = IndexLocations.getLinkdbLivePath(storageService).resolve(DOMAIN_LINKS_FILE_NAME);
return new DelayingDomainLinkDb(path); return new DelayingDomainLinks(path);
} }
@Provides @Provides

View File

@ -3,9 +3,9 @@ package nu.marginalia.index;
import com.google.inject.Inject; import com.google.inject.Inject;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.IndexLocations; import nu.marginalia.IndexLocations;
import nu.marginalia.functions.domainlinks.PartitionDomainLinksService; import nu.marginalia.linkgraph.PartitionLinkGraphService;
import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.linkdb.dlinks.DomainLinkDb; import nu.marginalia.linkgraph.DomainLinks;
import nu.marginalia.service.discovery.property.ServicePartition; import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.index.api.IndexMqEndpoints; import nu.marginalia.index.api.IndexMqEndpoints;
@ -34,7 +34,7 @@ public class IndexService extends Service {
private final FileStorageService fileStorageService; private final FileStorageService fileStorageService;
private final DocumentDbReader documentDbReader; private final DocumentDbReader documentDbReader;
private final DomainLinkDb domainLinkDb; private final DomainLinks domainLinks;
private final ServiceEventLog eventLog; private final ServiceEventLog eventLog;
@ -46,21 +46,21 @@ public class IndexService extends Service {
StatefulIndex statefulIndex, StatefulIndex statefulIndex,
FileStorageService fileStorageService, FileStorageService fileStorageService,
DocumentDbReader documentDbReader, DocumentDbReader documentDbReader,
DomainLinkDb domainLinkDb, DomainLinks domainLinks,
PartitionDomainLinksService partitionDomainLinksService, PartitionLinkGraphService partitionLinkGraphService,
ServiceEventLog eventLog) ServiceEventLog eventLog)
{ {
super(params, super(params,
ServicePartition.partition(params.configuration.node()), ServicePartition.partition(params.configuration.node()),
List.of(indexQueryService, List.of(indexQueryService,
partitionDomainLinksService) partitionLinkGraphService)
); );
this.opsService = opsService; this.opsService = opsService;
this.statefulIndex = statefulIndex; this.statefulIndex = statefulIndex;
this.fileStorageService = fileStorageService; this.fileStorageService = fileStorageService;
this.documentDbReader = documentDbReader; this.documentDbReader = documentDbReader;
this.domainLinkDb = domainLinkDb; this.domainLinks = domainLinks;
this.eventLog = eventLog; this.eventLog = eventLog;
this.init = params.initialization; this.init = params.initialization;
@ -106,7 +106,7 @@ public class IndexService extends Service {
if (Files.exists(newPathDomains)) { if (Files.exists(newPathDomains)) {
eventLog.logEvent("INDEX-SWITCH-DOMAIN-LINKDB", ""); eventLog.logEvent("INDEX-SWITCH-DOMAIN-LINKDB", "");
domainLinkDb.switchInput(newPathDomains); domainLinks.switchInput(newPathDomains);
} }
} }

View File

@ -50,8 +50,8 @@ dependencies {
implementation project(':code:functions:search-query') implementation project(':code:functions:search-query')
implementation project(':code:functions:search-query:api') implementation project(':code:functions:search-query:api')
implementation project(':code:functions:domain-links:api') implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:domain-links:aggregate') implementation project(':code:functions:link-graph:aggregate')
implementation libs.bundles.slf4j implementation libs.bundles.slf4j

View File

@ -2,7 +2,7 @@ package nu.marginalia.query;
import com.google.inject.Inject; import com.google.inject.Inject;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.functions.domainlinks.AggregateDomainLinksService; import nu.marginalia.linkgraph.AggregateLinkGraphService;
import nu.marginalia.functions.searchquery.QueryGRPCService; import nu.marginalia.functions.searchquery.QueryGRPCService;
import nu.marginalia.service.discovery.property.ServicePartition; import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.server.BaseServiceParams; import nu.marginalia.service.server.BaseServiceParams;
@ -17,7 +17,7 @@ public class QueryService extends Service {
@SneakyThrows @SneakyThrows
@Inject @Inject
public QueryService(BaseServiceParams params, public QueryService(BaseServiceParams params,
AggregateDomainLinksService domainLinksService, AggregateLinkGraphService domainLinksService,
QueryGRPCService queryGRPCService, QueryGRPCService queryGRPCService,
QueryBasicInterface queryBasicInterface) QueryBasicInterface queryBasicInterface)
{ {

View File

@ -17,9 +17,9 @@ include 'code:functions:math:api'
include 'code:functions:domain-info' include 'code:functions:domain-info'
include 'code:functions:domain-info:api' include 'code:functions:domain-info:api'
include 'code:functions:domain-links:partition' include 'code:functions:link-graph:partition'
include 'code:functions:domain-links:aggregate' include 'code:functions:link-graph:aggregate'
include 'code:functions:domain-links:api' include 'code:functions:link-graph:api'
include 'code:functions:search-query' include 'code:functions:search-query'
include 'code:functions:search-query:api' include 'code:functions:search-query:api'