Merge branch 'master' into warc

This commit is contained in:
Viktor Lofgren 2023-12-11 14:30:20 +01:00
commit 45987a1d98
57 changed files with 989 additions and 804 deletions

View File

@@ -1,12 +1,14 @@
 package nu.marginalia.assistant.client;
 
+import com.google.gson.reflect.TypeToken;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import io.reactivex.rxjava3.core.Observable;
 import nu.marginalia.assistant.client.model.DictionaryResponse;
+import nu.marginalia.assistant.client.model.DomainInformation;
+import nu.marginalia.assistant.client.model.SimilarDomain;
 import nu.marginalia.client.AbstractDynamicClient;
 import nu.marginalia.client.exception.RouteNotConfiguredException;
-import nu.marginalia.WmsaHome;
 import nu.marginalia.model.gson.GsonFactory;
 import nu.marginalia.service.descriptor.ServiceDescriptors;
 import nu.marginalia.service.id.ServiceId;
@@ -14,6 +16,7 @@ import nu.marginalia.client.Context;
 
 import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
 import java.util.List;
 
 @Singleton
@@ -59,4 +62,31 @@ public class AssistantClient extends AbstractDynamicClient {
             return Observable.empty();
         }
     }
+
+    public Observable<ArrayList<SimilarDomain>> similarDomains(Context ctx, int domainId, int count) {
+        try {
+            return super.get(ctx, 0, STR."/domain/\{domainId}/similar?count=\{count}", new TypeToken<ArrayList<SimilarDomain>>() {});
+        }
+        catch (RouteNotConfiguredException ex) {
+            return Observable.empty();
+        }
+    }
+
+    public Observable<ArrayList<SimilarDomain>> linkedDomains(Context ctx, int domainId, int count) {
+        try {
+            return super.get(ctx, 0, STR."/domain/\{domainId}/linking?count=\{count}", new TypeToken<ArrayList<SimilarDomain>>() {});
+        }
+        catch (RouteNotConfiguredException ex) {
+            return Observable.empty();
+        }
+    }
+
+    public Observable<DomainInformation> domainInformation(Context ctx, int domainId) {
+        try {
+            return super.get(ctx, 0, STR."/domain/\{domainId}/info", DomainInformation.class);
+        }
+        catch (RouteNotConfiguredException ex) {
+            return Observable.empty();
+        }
+    }
 }
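
For illustration (not part of the commit), a hypothetical call site for the new endpoints could look like the sketch below; the client variable and the printing subscriber are assumptions, while the method signatures match the code above:

    // Hypothetical caller: print similar domains for a given domain id
    assistantClient.similarDomains(ctx, domainId, 25)
            .subscribe(domains -> {
                for (var d : domains) {
                    System.out.println(d.url() + " relatedness=" + d.relatedness());
                }
            });
    // If the route is not configured, the Observable is simply empty and
    // the subscriber is never invoked.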

View File

@@ -0,0 +1,42 @@
+package nu.marginalia.assistant.client.model;
+
+import lombok.*;
+import nu.marginalia.model.EdgeDomain;
+
+@Getter @AllArgsConstructor @NoArgsConstructor @Builder
+@ToString
+public class DomainInformation {
+    EdgeDomain domain;
+
+    boolean blacklisted;
+    int pagesKnown;
+    int pagesFetched;
+    int pagesIndexed;
+    int incomingLinks;
+    int outboundLinks;
+    int nodeAffinity;
+    double ranking;
+
+    boolean suggestForCrawling;
+    boolean inCrawlQueue;
+    boolean unknownDomain;
+
+    String ip;
+    String ipCountry;
+    String state;
+
+    public String getIpFlag() {
+        if (ipCountry == null || ipCountry.isBlank()) {
+            return "";
+        }
+        String country = ipCountry;
+        if ("UK".equals(country)) {
+            country = "GB"; // the flag emoji is keyed to the ISO code GB
+        }
+        int offset = 0x1F1E6;
+        int asciiOffset = 0x41;
+        int firstChar = Character.codePointAt(country, 0) - asciiOffset + offset;
+        int secondChar = Character.codePointAt(country, 1) - asciiOffset + offset;
+        return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar));
+    }
+}
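
Worked example of the regional-indicator arithmetic above (not part of the diff): for ipCountry = "SE", 'S' is 0x53, and 0x53 - 0x41 + 0x1F1E6 = 0x1F1F8, the regional indicator symbol S; 'E' likewise maps to 0x1F1EA. Concatenated, the two code points render as the Swedish flag emoji. The "UK" remap exists because the flag emoji is keyed to the ISO 3166 code GB rather than UK.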

View File

@@ -0,0 +1,69 @@
+package nu.marginalia.assistant.client.model;
+
+import nu.marginalia.model.EdgeUrl;
+
+public record SimilarDomain(EdgeUrl url,
+                            int domainId,
+                            double relatedness,
+                            double rank,
+                            boolean indexed,
+                            boolean active,
+                            boolean screenshot,
+                            LinkType linkType) {
+
+    public String getRankSymbols() {
+        if (rank > 90) {
+            return "&#9733;&#9733;&#9733;&#9733;&#9733;";
+        }
+        if (rank > 70) {
+            return "&#9733;&#9733;&#9733;&#9733;";
+        }
+        if (rank > 50) {
+            return "&#9733;&#9733;&#9733;";
+        }
+        if (rank > 30) {
+            return "&#9733;&#9733;";
+        }
+        if (rank > 10) {
+            return "&#9733;";
+        }
+        return "";
+    }
+
+    public enum LinkType {
+        BACKWARD,
+        FOWARD,
+        BIDIRECTIONAL,
+        NONE;
+
+        public static LinkType find(boolean linkStod,
+                                    boolean linkDtos) {
+            if (linkDtos && linkStod)
+                return BIDIRECTIONAL;
+            if (linkDtos)
+                return FOWARD;
+            if (linkStod)
+                return BACKWARD;
+
+            return NONE;
+        }
+
+        public String toString() {
+            return switch (this) {
+                case FOWARD -> "&#8594;";
+                case BACKWARD -> "&#8592;";
+                case BIDIRECTIONAL -> "&#8646;";
+                case NONE -> "-";
+            };
+        }
+
+        public String getDescription() {
+            return switch (this) {
+                case BACKWARD -> "Backward Link";
+                case FOWARD -> "Forward Link";
+                case BIDIRECTIONAL -> "Mutual Link";
+                case NONE -> "No Link";
+            };
+        }
+    }
+}

View File

@@ -34,8 +34,23 @@ public class ExecutorRemoteActorFactory {
     }
 
     public interface ExecutorRemoteActorIf<T> {
-        boolean trigger(T object) throws Exception;
-        String getState();
+        /** Trigger the remote actor with the given object. The object will be serialized to JSON and sent to the
+         * remote actor. If the remote actor does not respond after a time period, a timeout will occur and a negative
+         * message id will be returned.
+         *
+         * @param object The message to send to the remote actor
+         * @return The message id of the response message, or a negative number if the remote actor did not respond
+         * within a reasonable timeout.
+         */
+        long trigger(T object) throws Exception;
+
+        /** Get the last finished state of the actor.
+         * <p>
+         * The message id of the request initiating the actor must be provided to ensure that
+         * we don't get a state from a previous run.
+         */
+        String getState(long fromMsgId);
     }
 
     public record CrawlData(FileStorageId storageId, boolean cascadeLoad) {}
@@ -58,11 +73,11 @@ class ExecutorRemoteActor<T> implements ExecutorRemoteActorFactory.ExecutorRemot
         this.triggerFunction = triggerFunction;
     }
 
-    public boolean trigger(T object) throws Exception {
+    public long trigger(T object) throws Exception {
         return trigger(gson.toJson(object));
     }
 
-    public boolean trigger(String payload) throws Exception {
+    public long trigger(String payload) throws Exception {
         long id = persistence.sendNewMessage(inboxName, null, null, triggerFunction, payload, null);
 
         // Wait for the remote actor to respond to the message
@@ -70,19 +85,19 @@ class ExecutorRemoteActor<T> implements ExecutorRemoteActorFactory.ExecutorRemot
         for (int i = 0; i < 120; i++) {
             var msg = persistence.getMessage(id);
             if (msg.state() == MqMessageState.ACK || msg.state() == MqMessageState.OK)
-                return true;
+                return id;
             if (msg.state() == MqMessageState.ERR || msg.state() == MqMessageState.DEAD)
-                return false;
+                return -id;
             TimeUnit.SECONDS.sleep(1);
         }
 
-        return false; // Timeout
+        return -1; // Timeout
     }
 
-    public String getState() {
+    public String getState(long fromMsgId) {
         return persistence
-                .getHeadMessage(inboxName)
+                .getHeadMessage(inboxName, fromMsgId)
                 .map(MqMessage::function)
                 .orElse("INITIAL");
     }
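
A sketch of the calling convention the new interface implies (hypothetical caller; the actor instance and the use of CrawlData are assumptions). Keeping the message id returned by trigger() and passing it to getState(long) ensures a state left over from an earlier run cannot be mistaken for the current run's:

    // Hypothetical caller illustrating the id-based handshake
    void runRemote(ExecutorRemoteActorFactory.ExecutorRemoteActorIf<CrawlData> actor,
                   FileStorageId storageId) throws Exception {
        long msgId = actor.trigger(new CrawlData(storageId, true));
        if (msgId < 0) {
            // negative id: the remote actor never ACKed within the polling window
            throw new IllegalStateException("Remote actor did not respond");
        }
        // Only messages with ID >= msgId are considered, per getHeadMessage()
        String state = actor.getState(msgId);
        System.out.println("Actor state: " + state);
    }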

View File

@@ -7,7 +7,7 @@ package nu.marginalia.index.client.model.query;
  * */
 public enum SearchSetIdentifier {
     NONE,
-    RETRO,
+    POPULAR,
     BLOGS,
     ACADEMIA,
     SMALLWEB

View File

@@ -1,6 +1,5 @@
 package nu.marginalia.client;
 
-import com.google.common.util.concurrent.ThreadFactoryBuilder;
 import io.reactivex.rxjava3.core.Scheduler;
 import io.reactivex.rxjava3.schedulers.Schedulers;
 import org.slf4j.Logger;
@@ -10,26 +9,16 @@ import javax.annotation.Nonnull;
 import javax.annotation.Nullable;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
-import java.util.concurrent.ThreadFactory;
 
 public class AbortingScheduler {
-    private final ThreadFactory threadFactory;
-
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     @Nullable
     private ExecutorService executorService;
 
-    public AbortingScheduler(String name) {
-        threadFactory = new ThreadFactoryBuilder()
-                .setNameFormat(name+"client--%d")
-                .setUncaughtExceptionHandler(this::handleException)
-                .build();
+    public AbortingScheduler() {
     }
 
-    private void handleException(Thread thread, Throwable throwable) {
-        logger.error("Uncaught exception during Client IO in thread {}", thread.getName(), throwable);
-    }
-
     public synchronized Scheduler get() {
         return Schedulers.from(getExecutorService(),
@@ -40,14 +29,14 @@ public class AbortingScheduler {
     public synchronized void abort() {
         if (null != executorService) {
             executorService.shutdownNow();
-            executorService = Executors.newFixedThreadPool(16, threadFactory);
+            executorService = Executors.newVirtualThreadPerTaskExecutor();
         }
     }
 
     @Nonnull
     private synchronized ExecutorService getExecutorService() {
         if (null == executorService) {
-            executorService = Executors.newFixedThreadPool(16, threadFactory);
+            executorService = Executors.newVirtualThreadPerTaskExecutor();
         }
         return executorService;
     }

View File

@@ -1,6 +1,7 @@
 package nu.marginalia.client;
 
 import com.google.gson.Gson;
+import com.google.gson.reflect.TypeToken;
 import com.google.protobuf.GeneratedMessageV3;
 import io.reactivex.rxjava3.core.Observable;
 import io.reactivex.rxjava3.core.ObservableSource;
@@ -20,6 +21,7 @@ import org.slf4j.LoggerFactory;
 import spark.utils.IOUtils;
 
 import java.io.OutputStream;
+import java.lang.reflect.Type;
 import java.net.ConnectException;
 import java.util.concurrent.TimeUnit;
 import java.util.function.Supplier;
@@ -233,6 +235,22 @@ public abstract class AbstractClient implements AutoCloseable {
                 .doFinally(() -> ThreadContext.remove("outbound-request"));
     }
 
+    protected synchronized <T> Observable<T> get(Context ctx, int node, String endpoint, TypeToken<T> type) {
+        ensureAlive(node);
+
+        var req = ctx.paint(new Request.Builder()).url(serviceRoutes.get(node) + endpoint).get().build();
+
+        return Observable.just(client.newCall(req))
+                .subscribeOn(scheduler().get())
+                .map(this::logInbound)
+                .map(Call::execute)
+                .map(this::logOutbound)
+                .map(rsp -> validateResponseStatus(rsp, req, 200))
+                .map(rsp -> getEntity(rsp, type))
+                .retryWhen(this::retryHandler)
+                .timeout(timeout, TimeUnit.SECONDS)
+                .doFinally(() -> ThreadContext.remove("outbound-request"));
+    }
+
     protected synchronized Observable<Integer> get(Context ctx, int node, String endpoint, OutputStream outputStream) {
         ensureAlive(node);
 
@@ -388,6 +406,15 @@ public abstract class AbstractClient implements AutoCloseable {
         }
     }
 
+    @SneakyThrows
+    private <T> T getEntity(Response response, TypeToken<T> clazz) {
+        try (response) {
+            return gson.fromJson(response.body().charStream(), clazz);
+        }
+        catch (Exception ex) {
+            throw ex;
+        }
+    }
+
     @SneakyThrows
     private String getText(Response response) {
         try (response) {
             return response.body().string();

View File

@@ -1,8 +1,6 @@
 package nu.marginalia.client;
 
 import com.google.gson.Gson;
-import nu.marginalia.client.route.RouteProvider;
-import nu.marginalia.client.route.ServiceRoute;
 import nu.marginalia.service.descriptor.ServiceDescriptor;
 
 import javax.annotation.Nonnull;
@@ -20,7 +18,7 @@ public class AbstractDynamicClient extends AbstractClient {
         );
 
         this.service = service;
-        this.scheduler = new AbortingScheduler(name());
+        this.scheduler = new AbortingScheduler();
     }
 
     @Override

View File

@@ -42,7 +42,7 @@ public class AbstractClientTest {
         client = new AbstractClient(new RouteProvider(new ServiceDescriptor(ServiceId.Api, "localhost")), 1, Gson::new) {
             @Override
             public AbortingScheduler scheduler() {
-                return new AbortingScheduler(name());
+                return new AbortingScheduler();
             }
 
             @Override

View File

@@ -15,6 +15,7 @@ dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:config')
     implementation project(':code:libraries:guarded-regex')
+    implementation project(':code:libraries:geo-ip')
 
     implementation libs.notnull

View File

@@ -1,72 +1,31 @@
 package nu.marginalia.ip_blocklist;
 
+import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import com.opencsv.CSVReader;
-import com.opencsv.exceptions.CsvValidationException;
-import lombok.AllArgsConstructor;
-import nu.marginalia.WmsaHome;
+import nu.marginalia.geoip.GeoIpDictionary;
 import nu.marginalia.model.EdgeDomain;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.FileReader;
-import java.io.IOException;
-import java.net.InetAddress;
 import java.util.Set;
-import java.util.TreeMap;
 
 @Singleton
 public class GeoIpBlocklist {
-    private final TreeMap<Long, GeoIpBlocklist.IpRange> ranges = new TreeMap<>();
+    /** These countries are extremely overrepresented among the problematic and spammy domains,
+     * and blocking them is by far the most effective spam mitigation technique. Sucks we throw
+     * babies out with the bathwater, but it's undeniably effective.
+     */
     private final Set<String> blacklist = Set.of("CN", "HK");
     private final Set<String> graylist = Set.of("RU", "TW", "IN", "ZA", "SG", "UA");
 
     private static final Logger logger = LoggerFactory.getLogger(GeoIpBlocklist.class);
 
-    @AllArgsConstructor
-    static class IpRange {
-        public final long from;
-        public final long to;
-        public final String country;
-    }
-
-    public GeoIpBlocklist() throws IOException, CsvValidationException {
-        var resource = WmsaHome.getIPLocationDatabse();
-
-        try (var reader = new CSVReader(new FileReader(resource.toFile()))) {
-            for (;;) {
-                String[] vals = reader.readNext();
-                if (vals == null) {
-                    break;
-                }
-                if (!(blacklist.contains(vals[2]) || graylist.contains(vals[2]))) {
-                    continue;
-                }
-                var range = new GeoIpBlocklist.IpRange(Long.parseLong(vals[0]),
-                        Long.parseLong(vals[1]),
-                        vals[2]);
-                ranges.put(range.from, range);
-            }
-        }
-
-        logger.info("Loaded {} IP ranges", ranges.size());
-    }
-
-    public String getCountry(InetAddress address) {
-        byte[] bytes = address.getAddress();
-        long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF);
-
-        Long key = ranges.floorKey(ival);
-        if (null == key) {
-            return "-";
-        }
-
-        var range = ranges.get(key);
-        if (ival >= key && ival < range.to) {
-            return range.country;
-        }
-
-        return "-";
+    private final GeoIpDictionary ipDictionary;
+
+    @Inject
+    public GeoIpBlocklist(GeoIpDictionary ipDictionary) {
+        this.ipDictionary = ipDictionary;
+        ipDictionary.waitReady();
     }
 
     public boolean isAllowed(EdgeDomain domain) {
@@ -84,7 +43,7 @@ public class GeoIpBlocklist {
     public String getCountry(EdgeDomain domain) {
         try {
-            return getCountry(InetAddressCache.getAddress(domain));
+            return ipDictionary.getCountry(InetAddressCache.getAddress(domain));
         }
         catch (Throwable ex) {
             logger.debug("Failed to resolve {}", domain);

View File

@@ -11,7 +11,7 @@ import java.util.concurrent.TimeUnit;
 // We don't want to torture the DNS by resolving the same links over and over and over again
 public class InetAddressCache {
-    private static final Cache<EdgeDomain, InetAddress> cache = CacheBuilder.newBuilder().maximumSize(10_000_000).expireAfterAccess(1, TimeUnit.HOURS).build();
+    private static final Cache<EdgeDomain, InetAddress> cache = CacheBuilder.newBuilder().maximumSize(1_000_000).expireAfterAccess(1, TimeUnit.HOURS).build();
     public static InetAddress getAddress(EdgeDomain domain) throws Throwable {
         try {
             return cache.get(domain, ()-> InetAddress.getByName(domain.getAddress()));

View File

@@ -0,0 +1,139 @@
+package nu.marginalia.util;
+
+import lombok.SneakyThrows;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.Consumer;
+
+/**
+ * Abstraction for exposing a (typically) read-from-disk -> parallel processing -> sequential output
+ * workflow as an iterator, where the number of tasks is much larger than the number of cores
+ */
+public class ProcessingIterator<T> implements Iterator<T> {
+    private static final Logger logger = LoggerFactory.getLogger(ProcessingIterator.class);
+
+    private final LinkedBlockingQueue<T> queue;
+    private final AtomicBoolean isFinished = new AtomicBoolean(false);
+    private final ExecutorService executorService;
+    private final Semaphore sem;
+
+    private T next = null;
+
+    private final int parallelism;
+
+    public ProcessingIterator(int queueSize, int parallelism, ProcessingJob<T> task) {
+        this.parallelism = parallelism;
+
+        queue = new LinkedBlockingQueue<>(queueSize);
+        executorService = Executors.newFixedThreadPool(parallelism);
+        sem = new Semaphore(parallelism);
+
+        executorService.submit(() -> executeJob(task));
+    }
+
+    private void executeJob(ProcessingJob<T> job) {
+        try {
+            job.run(this::executeTask);
+        } catch (Exception e) {
+            logger.warn("Exception while processing", e);
+        } finally {
+            isFinished.set(true);
+        }
+    }
+    /** Submit a task to the executor service; the semaphore bounds the number
+     * of tasks in flight so the producer cannot run arbitrarily far ahead */
+    private void executeTask(Task<T> task) {
+        try {
+            sem.acquire();
+        } catch (InterruptedException e) {
+            return;
+        }
+
+        executorService.submit(() -> {
+            try {
+                queue.put(task.get());
+            } catch (Exception e) {
+                logger.warn("Exception while processing", e);
+            } finally {
+                sem.release();
+            }
+        });
+    }
+    /** Returns true if there are more documents to be processed.
+     * This method may block until we are certain this is true.
+     * <p>
+     * This method must be invoked from the same thread that invokes next(),
+     * (or synchronize between the two)
+     */
+    @Override
+    @SneakyThrows
+    public boolean hasNext() {
+        if (next != null)
+            return true;
+
+        do {
+            next = queue.poll(1, TimeUnit.SECONDS);
+            if (next != null) {
+                return true;
+            }
+        } while (expectMore());
+
+        if (!executorService.isShutdown()) {
+            executorService.shutdown();
+        }
+
+        return false;
+    }
+
+    /** Heuristic for whether we should expect more documents to be processed,
+     * _trust but verify_ since we don't run this in an exclusive section
+     * and may get a false positive. We never expect a false negative though.
+     */
+    private boolean expectMore() {
+        return !isFinished.get()                      // we are still reading from the database
+                || !queue.isEmpty()                   // ... or we have documents in the queue
+                || sem.availablePermits() < parallelism; // ... or we are still processing documents
+    }
+
+    /** Returns the next document to be processed.
+     * This method may block until we are certain there is a document to be processed.
+     * <p>
+     * This method must be invoked from the same thread that invokes hasNext(),
+     * (or synchronize between the two)
+     * <p>
+     * If this is run after hasNext() returns false, a NoSuchElementException is thrown.
+     */
+    @SneakyThrows
+    @Override
+    public T next() {
+        if (!hasNext()) {
+            throw new NoSuchElementException();
+        }
+
+        try {
+            return next;
+        }
+        finally {
+            next = null;
+        }
+    }
+
+    /**
+     * A job that produces a sequence of processing tasks that are to be
+     * performed in parallel
+     */
+    public interface ProcessingJob<T2> {
+        void run(Consumer<Task<T2>> output) throws Exception;
+    }
+
+    /**
+     * A single task that produces a result to be iterable via the Iterator interface
+     * (along with other tasks' outputs)
+     */
+    public interface Task<T> {
+        T get() throws Exception;
+    }
+}
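
A minimal usage sketch of the read-parallel-consume workflow the class javadoc describes; the input file and the parse()/store() helpers are hypothetical (the new ProcessingIteratorTest later in this commit exercises the same pattern):

    // Parse lines on up to 8 worker threads, consume results on this thread
    Iterator<String> iter = new ProcessingIterator<>(32, 8, tasks -> {
        try (var lines = Files.lines(Path.of("documents.tsv"))) {   // assumed input
            lines.forEach(line -> tasks.accept(() -> parse(line))); // parse() assumed
        }
    });
    while (iter.hasNext()) {
        store(iter.next()); // results arrive in completion order, not input order
    }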

View File

@@ -0,0 +1,24 @@
+plugins {
+    id 'java'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(21))
+    }
+}
+
+dependencies {
+    implementation project(':code:common:config')
+    implementation libs.bundles.slf4j
+
+    implementation libs.opencsv
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
+
+test {
+    useJUnitPlatform()
+}

View File

@@ -0,0 +1,6 @@
+This micro-library handles GeoIP lookups, mapping IP addresses
+to country codes.
+
+It uses the free ip2location lite database, which is
+available from [https://lite.ip2location.com/database/ip-country](https://lite.ip2location.com/database/ip-country)
+under a CC-BY-SA 4.0 license.

View File

@@ -0,0 +1,90 @@
+package nu.marginalia.geoip;
+
+import com.opencsv.CSVReader;
+import nu.marginalia.WmsaHome;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.FileReader;
+import java.net.InetAddress;
+import java.util.TreeMap;
+
+public class GeoIpDictionary {
+    private volatile TreeMap<Long, IpRange> ranges = null;
+    private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class);
+
+    record IpRange(long from, long to, String country) {}
+
+    public GeoIpDictionary() {
+        Thread.ofPlatform().start(() -> {
+            try (var reader = new CSVReader(new FileReader(WmsaHome.getIPLocationDatabse().toFile()))) {
+                var dict = new TreeMap<Long, IpRange>();
+                for (;;) {
+                    String[] vals = reader.readNext();
+                    if (vals == null) {
+                        break;
+                    }
+                    var range = new IpRange(Long.parseLong(vals[0]),
+                            Long.parseLong(vals[1]),
+                            vals[2]);
+                    dict.put(range.from, range);
+                }
+                ranges = dict;
+                logger.info("Loaded {} IP ranges", ranges.size());
+            } catch (Exception e) {
+                ranges = new TreeMap<>();
+                throw new RuntimeException(e);
+            }
+            finally {
+                // notifyAll() requires holding the object's monitor, otherwise it
+                // throws IllegalMonitorStateException; wake up waitReady() callers
+                synchronized (this) {
+                    this.notifyAll();
+                }
+            }
+        });
+    }
+    public boolean isReady() {
+        return null != ranges;
+    }
+
+    public boolean waitReady() {
+        while (null == ranges) {
+            try {
+                synchronized (this) {
+                    this.wait(1000);
+                }
+            } catch (InterruptedException e) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    public String getCountry(String ip) {
+        try {
+            return getCountry(InetAddress.getByName(ip));
+        } catch (Exception e) {
+            return "";
+        }
+    }
+
+    public String getCountry(InetAddress address) {
+        if (null == ranges) { // not loaded yet or failed to load
+            return "";
+        }
+
+        byte[] bytes = address.getAddress();
+        long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF);
+
+        Long key = ranges.floorKey(ival);
+        if (null == key) {
+            return "";
+        }
+
+        var range = ranges.get(key);
+        if (ival >= key && ival < range.to) {
+            return range.country;
+        }
+
+        return "";
+    }
+}
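
A minimal sketch of the intended consumption pattern, mirroring how GeoIpBlocklist and DomainProcessor use the dictionary elsewhere in this commit (the IP address is an arbitrary example):

    GeoIpDictionary dict = new GeoIpDictionary(); // starts loading the CSV on a background thread
    dict.waitReady();                             // block until the load completes (or fails)
    String cc = dict.getCountry("93.184.216.34"); // two-letter country code, or "" when unknown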

View File

@@ -496,17 +496,21 @@ public class MqPersistence {
         return gson;
     }
 
-    /** Returns the last message sent to this inbox with a state of 'OK' */
-    public Optional<MqMessage> getHeadMessage(String inboxName) {
+    /** Returns the last message sent to this inbox with a state of 'OK'
+     * with an id greater than or equal to fromMsgId
+     */
+    public Optional<MqMessage> getHeadMessage(String inboxName, long fromMsgId) {
         try (var conn = dataSource.getConnection();
              var query = conn.prepareStatement("""
                      SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE, SENDER_INBOX
                      FROM MESSAGE_QUEUE
-                     WHERE RECIPIENT_INBOX = ? AND STATE='OK'
+                     WHERE RECIPIENT_INBOX = ? AND STATE='OK' AND ID >= ?
                      ORDER BY ID DESC LIMIT 1
                      """))
         {
             query.setString(1, inboxName);
+            query.setLong(2, fromMsgId);
 
             var rs = query.executeQuery();
             if (rs.next()) {
                 long msgId = rs.getLong(1);

View File

@@ -4,6 +4,7 @@ import lombok.AllArgsConstructor;
 import lombok.Builder;
 import lombok.ToString;
 import nu.marginalia.bigstring.BigString;
+import nu.marginalia.model.EdgeUrl;
 
 @Builder
 @AllArgsConstructor
@@ -35,4 +36,16 @@ public class CrawledDocument implements SerializableCrawlData {
         return SERIAL_IDENTIFIER;
     }
 
+    @Override
+    public String getDomain() {
+        if (url == null)
+            return null;
+
+        return EdgeUrl
+                .parse(url)
+                .map(EdgeUrl::getDomain)
+                .map(d -> d.domain)
+                .orElse(null);
+    }
 }

View File

@@ -2,4 +2,5 @@ package nu.marginalia.crawling.model;
 
 public interface SerializableCrawlData {
     String getSerialIdentifier();
+    String getDomain();
 }

View File

@@ -3,6 +3,7 @@ package nu.marginalia.io.processed;
 import blue.strategic.parquet.HydratorSupplier;
 import blue.strategic.parquet.ParquetReader;
 import nu.marginalia.model.processed.DomainRecord;
+import nu.marginalia.model.processed.DomainWithIp;
 import org.jetbrains.annotations.NotNull;
 
 import java.io.IOException;
@@ -19,10 +20,10 @@ public class DomainRecordParquetFileReader {
     }
 
     @NotNull
-    public static List<String> getDomainNames(Path path) throws IOException {
+    public static List<DomainWithIp> getBasicDomainInformation(Path path) throws IOException {
         return ParquetReader.streamContent(path.toFile(),
                 HydratorSupplier.constantly(DomainRecord.newDomainNameHydrator()),
-                List.of("domain"))
+                List.of("domain", "ip"))
                 .toList();
     }

View File

@@ -8,7 +8,6 @@ import org.apache.parquet.schema.*;
 import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;
 
-import java.sql.Array;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -48,8 +47,8 @@ public class DomainRecord {
         return DomainRecord::dehydrate;
     }
 
-    public static Hydrator<String, String> newDomainNameHydrator() {
-        return new DomainNameHydrator();
+    public static Hydrator<DomainWithIp, DomainWithIp> newDomainNameHydrator() {
+        return new DomainWithIpHydrator();
     }
 
@@ -124,23 +123,26 @@ class DomainHydrator implements Hydrator<DomainRecord, DomainRecord> {
     }
 }
 
-class DomainNameHydrator implements Hydrator<String, String> {
+class DomainWithIpHydrator implements Hydrator<DomainWithIp, DomainWithIp> {
 
     @Override
-    public String start() {
-        return "";
+    public DomainWithIp start() {
+        return new DomainWithIp();
     }
 
     @Override
-    public String add(String target, String heading, Object value) {
+    public DomainWithIp add(DomainWithIp target, String heading, Object value) {
         if ("domain".equals(heading)) {
-            return (String) value;
+            target.domain = (String) value;
+        }
+        else if ("ip".equals(heading)) {
+            target.ip = (String) value;
         }
         return target;
     }
 
     @Override
-    public String finish(String target) {
+    public DomainWithIp finish(DomainWithIp target) {
         return target;
     }
 }

View File

@@ -0,0 +1,15 @@
+package nu.marginalia.model.processed;
+
+import lombok.AllArgsConstructor;
+import lombok.EqualsAndHashCode;
+import lombok.NoArgsConstructor;
+import lombok.ToString;
+
+@AllArgsConstructor
+@NoArgsConstructor
+@EqualsAndHashCode
+@ToString
+public class DomainWithIp {
+    public String domain;
+    public String ip;
+}

View File

@@ -1,6 +1,7 @@
 package nu.marginalia.io.processed;
 
 import nu.marginalia.model.processed.DomainRecord;
+import nu.marginalia.model.processed.DomainWithIp;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -53,8 +54,11 @@ class DomainRecordParquetFileReaderTest {
             writer.write(second);
         }
 
-        var domainNames = DomainRecordParquetFileReader.getDomainNames(parquetFile);
-        assertEquals(List.of("www.marginalia.nu", "memex.marginalia.nu"), domainNames);
+        var domainInfo = DomainRecordParquetFileReader.getBasicDomainInformation(parquetFile);
+        assertEquals(List.of(
+                new DomainWithIp("www.marginalia.nu", "127.0.0.1"),
+                new DomainWithIp("memex.marginalia.nu", "127.0.0.1")),
+                domainInfo);
 
         var items = DomainRecordParquetFileReader
                 .stream(parquetFile)

View File

@@ -41,6 +41,7 @@ dependencies {
     implementation project(':code:libraries:guarded-regex')
     implementation project(':code:libraries:easy-lsh')
+    implementation project(':code:libraries:geo-ip')
     implementation project(':code:libraries:big-string')
     implementation project(':code:libraries:language-processing')

View File

@@ -138,10 +138,16 @@ public class ConverterMain {
             for (var domain : plan.crawlDataIterable(id -> !batchingWorkLog.isItemProcessed(id)))
             {
                 pool.submit(() -> {
-                    ProcessedDomain processed = processor.process(domain);
-                    converterWriter.accept(processed);
-
-                    heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
+                    try {
+                        ProcessedDomain processed = processor.process(domain);
+                        converterWriter.accept(processed);
+                    }
+                    catch (Exception ex) {
+                        logger.info("Error in processing", ex);
+                    }
+                    finally {
+                        heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
+                    }
                 });
             }

View File

@@ -10,6 +10,7 @@ import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.logic.links.LinkGraph;
 import nu.marginalia.crawling.io.SerializableCrawlDataStream;
 import nu.marginalia.crawling.model.*;
+import nu.marginalia.geoip.GeoIpDictionary;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.model.EdgeDomain;
@@ -22,6 +23,7 @@ import org.slf4j.LoggerFactory;
 
 import java.sql.SQLException;
 import java.util.*;
+import java.util.regex.Pattern;
 
 public class DomainProcessor {
     private final DocumentProcessor documentProcessor;
@@ -29,6 +31,7 @@ public class DomainProcessor {
     private final AnchorTagsSource anchorTagsSource;
     private final AnchorTextKeywords anchorTextKeywords;
     private final LshDocumentDeduplicator documentDeduplicator;
+    private final GeoIpDictionary geoIpDictionary;
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
@@ -37,13 +40,16 @@ public class DomainProcessor {
                            SiteWords siteWords,
                            AnchorTagsSourceFactory anchorTagsSourceFactory,
                            AnchorTextKeywords anchorTextKeywords,
-                           LshDocumentDeduplicator documentDeduplicator) throws SQLException
+                           LshDocumentDeduplicator documentDeduplicator, GeoIpDictionary geoIpDictionary) throws SQLException
     {
         this.documentProcessor = documentProcessor;
         this.siteWords = siteWords;
         this.anchorTextKeywords = anchorTextKeywords;
        this.documentDeduplicator = documentDeduplicator;
         this.anchorTagsSource = anchorTagsSourceFactory.create();
+        this.geoIpDictionary = geoIpDictionary;
+
+        geoIpDictionary.waitReady();
     }
 
     @SneakyThrows
@@ -54,11 +60,21 @@ public class DomainProcessor {
         boolean cookies = false;
         String ip = "";
 
-        DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain);
+        DomainLinks externalDomainLinks = null;
 
         while (dataStream.hasNext()) {
             var data = dataStream.next();
 
+            // Do a lazy load of the external domain links since we don't know the domain
+            // until we see the first document
+            if (externalDomainLinks == null) {
+                var domain = data.getDomain();
+
+                if (domain != null) {
+                    externalDomainLinks = anchorTagsSource.getAnchorTags(domain);
+                }
+            }
+
             if (data instanceof CrawledDomain crawledDomain) {
                 ret.domain = new EdgeDomain(crawledDomain.domain);
                 ret.ip = crawledDomain.ip;
@@ -76,8 +92,15 @@ public class DomainProcessor {
                 try {
                     if (doc.url == null)
                         continue;
+
                     fixBadCanonicalTag(doc);
+                    // This case should never be reachable, as we should have initialized
+                    // the externalDomainLinks variable above if we made it past the
+                    // doc.url == null check; but we'll leave it here just in case
+                    // to make debugging easier if we break this.
+                    assert externalDomainLinks != null : "externalDomainLinks has not been initialized";
                     docs.add(documentProcessor.process(doc, externalDomainLinks));
                 }
                 catch (Exception ex) {
@@ -89,11 +112,22 @@ public class DomainProcessor {
         // Add late keywords and features from domain-level information
 
         List<String> terms = new ArrayList<>();
+
         terms.add("ip:"+ip);
+
+        String ipCountryCode = geoIpDictionary.getCountry(ip).toLowerCase();
+        if (!ipCountryCode.isBlank()) {
+            terms.add("ip:"+ipCountryCode);
+        }
+
         if (cookies) {
             terms.add(HtmlFeature.COOKIES.getKeyword());
         }
 
+        if (isAcademicDomain(ret.domain)) {
+            terms.add("special:academia");
+        }
+
         for (var document : ret.documents) {
             if (document.details == null)
                 continue;
@@ -114,6 +148,19 @@ public class DomainProcessor {
         return ret;
     }
 
+    private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
+    private boolean isAcademicDomain(EdgeDomain domain) {
+
+        if (domain.domain.endsWith(".edu"))
+            return true;
+
+        if (academicPattern.matcher(domain.domain).matches())
+            return true;
+
+        return false;
+    }
+
     private void fixBadCanonicalTag(CrawledDocument doc) {
         // Some sites have a canonical tag that points to a different domain,
         // but our loader can not support this, so we point these back to the
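
By way of example (not from the diff): the new special:academia rule matches bare .edu domains such as mit.edu via the endsWith check, and two-label academic suffixes such as ox.ac.uk or unimelb.edu.au via academicPattern; a host like education.example.com matches neither, since the pattern anchors on the final two labels.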

View File

@@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload;
 
 import com.google.gson.Gson;
 import com.google.inject.Inject;
+import nu.marginalia.atags.AnchorTextKeywords;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory;
 import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
@@ -21,6 +22,7 @@ public class SideloadSourceFactory {
     private final SideloaderProcessing sideloaderProcessing;
     private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
     private final DocumentKeywordExtractor documentKeywordExtractor;
+    private final AnchorTextKeywords anchorTextKeywords;
     private final AnchorTagsSourceFactory anchorTagsSourceFactory;
     private final DirtreeSideloaderFactory dirtreeSideloaderFactory;
     private final WarcSideloadFactory warcSideloadFactory;
@@ -29,7 +31,7 @@ public class SideloadSourceFactory {
     public SideloadSourceFactory(Gson gson,
                                  SideloaderProcessing sideloaderProcessing,
                                  ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
-                                 DocumentKeywordExtractor documentKeywordExtractor,
+                                 DocumentKeywordExtractor documentKeywordExtractor, AnchorTextKeywords anchorTextKeywords,
                                  AnchorTagsSourceFactory anchorTagsSourceFactory,
                                  DirtreeSideloaderFactory dirtreeSideloaderFactory,
                                  WarcSideloadFactory warcSideloadFactory) {
@@ -37,13 +39,14 @@ public class SideloadSourceFactory {
         this.sideloaderProcessing = sideloaderProcessing;
         this.sentenceExtractorProvider = sentenceExtractorProvider;
         this.documentKeywordExtractor = documentKeywordExtractor;
+        this.anchorTextKeywords = anchorTextKeywords;
         this.anchorTagsSourceFactory = anchorTagsSourceFactory;
         this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
         this.warcSideloadFactory = warcSideloadFactory;
     }
 
     public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException {
-        return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, sideloaderProcessing);
+        return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing);
     }
 
     public Collection<? extends SideloadSource> sideloadDirtree(Path pathToYamlFile) throws IOException {

View File

@@ -3,6 +3,7 @@ package nu.marginalia.converting.sideload.encyclopedia;
 
 import com.github.luben.zstd.ZstdInputStream;
 import com.google.gson.Gson;
 import lombok.SneakyThrows;
+import nu.marginalia.atags.AnchorTextKeywords;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.converting.model.DisqualifiedException;
@@ -14,6 +15,7 @@ import nu.marginalia.converting.sideload.SideloaderProcessing;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.DomainIndexingState;
+import nu.marginalia.util.ProcessingIterator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -27,15 +29,10 @@ import java.nio.file.Path;
 import java.sql.*;
 import java.util.Iterator;
 import java.util.List;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.Semaphore;
-import java.util.concurrent.atomic.AtomicBoolean;
 
 /** This is an experimental sideloader for encyclopedia.marginalia.nu's database;
  * (which serves as a way of loading wikipedia's zim files without binding to GPL2'd code)
- *
+ * <p>
  * See https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu for extracting the data
  */
 public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
@@ -43,6 +40,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
     private final Connection connection;
     private final EdgeUrl baseUrl;
     private final Gson gson;
+    private final AnchorTextKeywords anchorTextKeywords;
     private final SideloaderProcessing sideloaderProcessing;
     private final AnchorTagsSourceFactory anchorTagsSourceFactory;
     private static final Logger logger = LoggerFactory.getLogger(EncyclopediaMarginaliaNuSideloader.class);
@@ -51,9 +49,11 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
                                             String baseUrl,
                                             Gson gson,
                                             AnchorTagsSourceFactory anchorTagsSourceFactory,
+                                            AnchorTextKeywords anchorTextKeywords,
                                             SideloaderProcessing sideloaderProcessing) throws SQLException {
         this.baseUrl = EdgeUrl.parse(baseUrl).orElseThrow(AssertionError::new);
         this.gson = gson;
+        this.anchorTextKeywords = anchorTextKeywords;
         this.sideloaderProcessing = sideloaderProcessing;
 
         String sqliteDbString = "jdbc:sqlite:" + pathToDbFile.toString();
@@ -76,62 +76,24 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
     @SneakyThrows
     @Override
     public Iterator<ProcessedDocument> getDocumentsStream() {
-        LinkedBlockingQueue<ProcessedDocument> docs = new LinkedBlockingQueue<>(32);
-        AtomicBoolean isFinished = new AtomicBoolean(false);
-        ExecutorService executorService = Executors.newFixedThreadPool(16);
-        Semaphore sem = new Semaphore(16);
-
-        DomainLinks domainLinks = getDomainLinks();
-
-        executorService.submit(() -> {
-            try {
-                var stmt = connection.prepareStatement("""
-                        SELECT url,title,html FROM articles
-                        """);
-                stmt.setFetchSize(100);
-
-                var rs = stmt.executeQuery();
-                while (rs.next()) {
-                    var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
-                    String title = rs.getString("title");
-                    String url = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8);
-
-                    sem.acquire();
-
-                    executorService.submit(() -> {
-                        try {
-                            docs.add(convertDocument(articleParts.parts, title, url, domainLinks));
-                        } catch (URISyntaxException | DisqualifiedException e) {
-                            e.printStackTrace();
-                        } finally {
-                            sem.release();
-                        }
-                    });
-                }
-
-                stmt.close();
-            }
-            catch (Exception e) {
-                e.printStackTrace();
-            }
-            finally {
-                isFinished.set(true);
-            }
-        });
-
-        return new Iterator<>() {
-            @Override
-            public boolean hasNext() {
-                return !isFinished.get() || !docs.isEmpty() || sem.availablePermits() < 16;
-            }
-
-            @SneakyThrows
-            @Override
-            public ProcessedDocument next() {
-                return docs.take();
-            }
-        };
+        return new ProcessingIterator<>(24, 16, (taskConsumer) -> {
+            DomainLinks domainLinks = getDomainLinks();
+
+            var stmt = connection.prepareStatement("""
+                    SELECT url,title,html FROM articles
+                    """);
+            stmt.setFetchSize(100);
+
+            var rs = stmt.executeQuery();
+            while (rs.next()) {
+                var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
+                String title = rs.getString("title");
+                String url = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8);
+
+                taskConsumer.accept(() -> convertDocument(articleParts.parts, title, url, domainLinks));
+            }
+        });
     }
 
     private DomainLinks getDomainLinks() {
@@ -142,30 +104,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
             logger.error("Failed to create anchor tags source", ex);
             return new DomainLinks();
         }
     }
-
-    ProcessedDocument processJust(String url) throws SQLException, IOException, URISyntaxException, DisqualifiedException {
-        var stmt = connection.prepareStatement("""
-                SELECT url,title,html
-                FROM articles
-                WHERE url=?
-                """);
-        stmt.setFetchSize(100);
-        stmt.setString(1, url);
-
-        var rs = stmt.executeQuery();
-        if (rs.next()) {
-            var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
-            String title = rs.getString("title");
-
-            return convertDocument(articleParts.parts,
-                    title,
-                    URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8),
-                    new DomainLinks() // FIXME (2023-11-06): Sideloaded dirtrees don't have access to anchor tag data.
-            );
-        }
-        return null;
-    }
 
     private ProcessedDocument convertDocument(List<String> parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException {
@@ -180,13 +118,22 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
         }
         fullHtml.append("</body></html>");
 
-        return sideloaderProcessing
+        var doc = sideloaderProcessing
                 .processDocument(fullUrl,
                         fullHtml.toString(),
                         List.of("encyclopedia", "wiki"),
                        domainLinks,
                         GeneratorType.WIKI,
                         10_000_000);
+
+        // Add anchor text keywords
+        if (doc.isProcessedFully()) {
+            doc.words.addAnchorTerms(
+                    anchorTextKeywords.getAnchorTextKeywords(domainLinks, doc.url)
+            );
+        }
+
+        return doc;
     }
 
     private <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException {
private <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException { private <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException {

View File

@@ -6,6 +6,7 @@ import lombok.SneakyThrows;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
+import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
@@ -78,7 +79,7 @@ public class CrawlingThenConvertingIntegrationTest {
         List<SerializableCrawlData> data = new ArrayList<>();
 
         try (var recorder = new WarcRecorder()) {
-            new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch();
+            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch();
         }
 
         CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get();

View File

@@ -1,38 +1,12 @@
 package nu.marginalia.converting.sideload.encyclopedia;
 
-import com.google.inject.AbstractModule;
-import com.google.inject.Guice;
-import gnu.trove.list.array.TLongArrayList;
-import nu.marginalia.atags.source.AnchorTagsSourceFactory;
-import nu.marginalia.atags.model.DomainLinks;
-import nu.marginalia.converting.ConverterModule;
-import nu.marginalia.converting.model.DisqualifiedException;
-import nu.marginalia.converting.processor.ConverterDomainTypes;
-import nu.marginalia.converting.sideload.SideloaderProcessing;
-import nu.marginalia.io.processed.DocumentRecordParquetFileReader;
-import nu.marginalia.io.processed.DocumentRecordParquetFileWriter;
-import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.model.gson.GsonFactory;
-import nu.marginalia.model.idx.WordMetadata;
-import nu.marginalia.model.processed.DocumentRecord;
 import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
-import org.mockito.Mockito;
 
 import java.io.IOException;
-import java.net.URISyntaxException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import static org.junit.jupiter.api.Assertions.*;
-import static org.mockito.Mockito.when;
 
 class EncyclopediaMarginaliaNuSideloaderTest {
     Path tempFile;
@@ -62,93 +36,5 @@ class EncyclopediaMarginaliaNuSideloaderTest {
         System.out.printf("%64s\n", Long.toBinaryString(Long.reverseBytes(0x1000000000000000L)));
         System.out.printf("%64s\n", Long.toBinaryString(0x10L));
     }
-
-    @Test
-    public void debugSpecificArticle() throws SQLException, IOException, URISyntaxException, DisqualifiedException {
-        Path pathToDbFile = Path.of("/home/vlofgren/Code/MarginaliaSearch/run/samples/articles.db");
-        if (!Files.exists(pathToDbFile)) {
-            // not really practical to ship a 40 Gb sqlite files on github
-            // be @vlofgren to run this test
-            return;
-        }
-        var domainTypesMock = Mockito.mock(ConverterDomainTypes.class);
-        when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false);
-
-        var processing = Guice.createInjector(new ConverterModule(),
-                        new AbstractModule() {
-                            public void configure() {
-                                bind(ConverterDomainTypes.class).toInstance(domainTypesMock);
-                            }
-                        }
-                )
-                .getInstance(SideloaderProcessing.class);
-        var atagsFactory = Mockito.mock(AnchorTagsSourceFactory.class);
-        when(atagsFactory.create(Mockito.any())).thenReturn(domain -> new DomainLinks());
-
-        var sideloader = new EncyclopediaMarginaliaNuSideloader(
-                pathToDbFile,
-                "https://en.wikipedia.org/wiki/",
-                GsonFactory.get(),
-                atagsFactory,
-                processing
-        );
-
-        var document = sideloader.processJust("Don't_Tell_Me_(Madonna_song)");
-
-        System.out.println(document);
-
-        var keywordsBuilt = document.words.build();
-
-        var ptr = keywordsBuilt.newPointer();
-        Map<String, WordMetadata> dirtyAndBlues = new HashMap<>();
-
-        while (ptr.advancePointer()) {
-            String word = ptr.getKeyword();
-
-            System.out.println(word + ": " + Long.toHexString(Long.reverseBytes(ptr.getMetadata())));
-
-            if (Set.of("dirty", "blues").contains(word)) {
-                WordMetadata meta = new WordMetadata(ptr.getMetadata());
-
-                Assertions.assertNull(
-                        dirtyAndBlues.put(word, meta)
-                );
-            }
-        }
-
-        Assertions.assertTrue(dirtyAndBlues.containsKey("dirty"));
-        Assertions.assertTrue(dirtyAndBlues.containsKey("blues"));
-        Assertions.assertNotEquals(
-                dirtyAndBlues.get("dirty"),
-                dirtyAndBlues.get("blues")
-        );
-
-        try (var dw = new DocumentRecordParquetFileWriter(tempFile)) {
-            dw.write(new DocumentRecord(
-                    "encyclopedia.marginalia.nu",
-                    document.url.toString(),
-                    0,
-                    document.state.toString(),
-                    document.stateReason,
-                    document.details.title,
-                    document.details.description,
-                    HtmlFeature.encode(document.details.features),
-                    document.details.standard.name(),
-                    document.details.length,
-                    document.details.hashCode,
-                    (float) document.details.quality,
-                    document.details.metadata.encode(),
-                    document.details.pubYear,
-                    List.of(keywordsBuilt.keywords),
-                    new TLongArrayList(keywordsBuilt.metadata)
-            ));
-        }
-
-        var record = DocumentRecordParquetFileReader.streamKeywordsProjection(tempFile).findFirst().get();
-        String[] words = record.words.toArray(String[]::new);
-        long[] meta = record.metas.toArray();
-
-        assertArrayEquals(keywordsBuilt.keywords, words);
-        assertArrayEquals(keywordsBuilt.metadata, meta);
-    }
 }

View File

@@ -0,0 +1,38 @@
+package nu.marginalia.util;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+class ProcessingIteratorTest {
+    @Test
+    public void test() {
+        Set<Integer> output = new HashSet<>();
+        var iter = new ProcessingIterator<Integer>(2, 2, q -> {
+            for (int i = 0; i < 10_000; i++) {
+                int j = i;
+                q.accept(() -> task(j));
+            }
+        });
+        while (iter.hasNext()) {
+            output.add(iter.next());
+        }
+
+        assertEquals(10_000, output.size());
+
+        for (int i = 0; i < 10_000; i++) {
+            assertTrue(output.contains(i));
+        }
+    }
+
+    int task(int n) throws InterruptedException {
+        TimeUnit.NANOSECONDS.sleep(10);
+        return n;
+    }
+}

View File

@@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.source.AnchorTagsSource;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
+import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.spec.CrawlSpecProvider;
@@ -51,6 +52,7 @@ public class CrawlerMain {
     private final ProcessHeartbeatImpl heartbeat;
     private final MessageQueueFactory messageQueueFactory;
+    private final DomainProber domainProber;
     private final FileStorageService fileStorageService;
     private final DbCrawlSpecProvider dbCrawlSpecProvider;
     private final AnchorTagsSourceFactory anchorTagsSourceFactory;
@@ -70,7 +72,7 @@ public class CrawlerMain {
     @Inject
     public CrawlerMain(UserAgent userAgent,
                        ProcessHeartbeatImpl heartbeat,
-                       MessageQueueFactory messageQueueFactory,
+                       MessageQueueFactory messageQueueFactory, DomainProber domainProber,
                        FileStorageService fileStorageService,
                        ProcessConfiguration processConfiguration,
                        DbCrawlSpecProvider dbCrawlSpecProvider,
@@ -78,6 +80,7 @@ public class CrawlerMain {
                        Gson gson) {
         this.heartbeat = heartbeat;
         this.messageQueueFactory = messageQueueFactory;
+        this.domainProber = domainProber;
         this.fileStorageService = fileStorageService;
         this.dbCrawlSpecProvider = dbCrawlSpecProvider;
         this.anchorTagsSourceFactory = anchorTagsSourceFactory;
@@ -211,14 +214,13 @@ public class CrawlerMain {
             try (CrawledDomainWriter writer = new CrawledDomainWriter(outputDir, domain, id);
                  var warcRecorder = new WarcRecorder(); // write to a temp file for now
-                 var retreiver = new CrawlerRetreiver(fetcher, specification, warcRecorder, writer::accept);
+                 var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder, writer::accept);
                  CrawlDataReference reference = getReference())
             {
                 Thread.currentThread().setName("crawling:" + domain);
 
                 var domainLinks = anchorTagsSource.getAnchorTags(domain);
 
                 int size = retreiver.fetch(domainLinks, reference);
 
                 workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size);

View File

@ -45,7 +45,7 @@ public class CrawlerRetreiver implements AutoCloseable {
private static final UrlBlocklist urlBlocklist = new UrlBlocklist();
private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
-private static final DomainProber domainProber = new DomainProber();
+private final DomainProber domainProber;
private final SitemapRetriever sitemapRetriever;
private final DomainCrawlFrontier crawlFrontier;
private final WarcRecorder warcRecorder;
@ -59,12 +59,14 @@ public class CrawlerRetreiver implements AutoCloseable {
private static final String documentWasSameTag = "SAME-BY-COMPARISON";
public CrawlerRetreiver(HttpFetcher fetcher,
+DomainProber domainProber,
CrawlSpecRecord specs,
WarcRecorder warcRecorder,
Consumer<SerializableCrawlData> writer)
{
this.warcRecorder = warcRecorder;
this.fetcher = fetcher;
+this.domainProber = domainProber;
domain = specs.domain;

View File

@ -1,5 +1,7 @@
package nu.marginalia.crawl.retreival;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawling.model.CrawlerDomainStatus;
@ -11,17 +13,21 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
+import java.util.function.Predicate;
+@Singleton
public class DomainProber {
private final Logger logger = LoggerFactory.getLogger(DomainProber.class);
-private static IpBlockList ipBlockList;
-static {
-try {
-ipBlockList = new IpBlockList(new GeoIpBlocklist());
-} catch (Exception e) {
-throw new RuntimeException(e);
-}
+private final Predicate<EdgeDomain> domainBlacklist;
+@Inject
+public DomainProber(IpBlockList ipBlockList) {
+this.domainBlacklist = ipBlockList::isAllowed;
+}
+/** For testing */
+public DomainProber(Predicate<EdgeDomain> domainBlacklist) {
+this.domainBlacklist = domainBlacklist;
}
/** To detect problems early we do a probing request to the domain before we start crawling it properly.
@ -37,7 +43,7 @@ public class DomainProber {
return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
}
-if (!ipBlockList.isAllowed(firstUrlInQueue.domain))
+if (!domainBlacklist.test(firstUrlInQueue.domain))
return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");
var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
@ -62,7 +68,7 @@ public class DomainProber {
/** This domain redirects to another domain */
public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
-/** If the retreivala of the probed url was successful, return the url as it was fetched
+/** If the retrieval of the probed url was successful, return the url as it was fetched
* (which may be different from the url we probed, if we attempted another URL schema).
*
* @param probedUrl The url we successfully probed
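For context: swapping the static IpBlockList initializer for constructor injection gives DomainProber a test seam. Production wiring goes through Guice and the @Inject constructor; tests can pass any Predicate<EdgeDomain>. Both uses, as assumed from the diffs in this commit (the `injector` variable is illustrative, not from the diff):

// Production: Guice resolves the @Inject constructor and supplies the IpBlockList.
DomainProber prober = injector.getInstance(DomainProber.class);

// Tests: a lambda stands in for the blocklist, here allowing every domain;
// this is exactly the form the test diffs below use.
DomainProber permissiveProber = new DomainProber(d -> true);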

View File

@ -90,7 +90,10 @@ public class HttpFetcherImpl implements HttpFetcher {
}
@Inject
-public HttpFetcherImpl(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) {
+public HttpFetcherImpl(@Named("user-agent") String userAgent,
+Dispatcher dispatcher,
+ConnectionPool connectionPool)
+{
this.client = createClient(dispatcher, connectionPool);
this.userAgent = userAgent;
this.contentTypeProber = new ContentTypeProber(userAgent, client);

View File

@ -3,6 +3,7 @@ package nu.marginalia.crawling.retreival;
import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
+import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.*;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.model.CrawledDocument;
@ -65,7 +66,7 @@ public class CrawlerMockFetcherTest {
void crawl(CrawlSpecRecord spec, Consumer<SerializableCrawlData> consumer) throws IOException {
try (var recorder = new WarcRecorder()) {
-new CrawlerRetreiver(fetcherMock, spec, recorder, consumer)
+new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder, consumer)
.fetch();
}
}

View File

@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
+import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
@ -59,7 +60,7 @@ class CrawlerRetreiverTest {
List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder(tempFile)) {
-new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch();
+new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch();
} catch (IOException ex) {
Assertions.fail(ex);
}
@ -103,7 +104,7 @@ class CrawlerRetreiverTest {
List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder()) {
-new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch();
+new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch();
}
catch (IOException ex) {
Assertions.fail(ex);
@ -137,7 +138,7 @@ class CrawlerRetreiverTest {
List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder()) {
-new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch();
+new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch();
}
catch (IOException ex) {
Assertions.fail(ex);
@ -178,7 +179,7 @@ class CrawlerRetreiverTest {
Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
try (var recorder = new WarcRecorder()) {
-new CrawlerRetreiver(httpFetcher, specs, recorder, d -> {
+new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, d -> {
data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d);
if (d instanceof CrawledDocument doc) {
System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
@ -202,7 +203,7 @@ class CrawlerRetreiverTest {
CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
try (var recorder = new WarcRecorder()) {
-new CrawlerRetreiver(httpFetcher, specs, recorder, d -> {
+new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, d -> {
if (d instanceof CrawledDocument doc) {
System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
}
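Since every constructor call in these tests now repeats `new DomainProber(d -> true)`, a small factory method could cut the noise; a possible refactoring, not part of this commit:

// Hypothetical test helper:
private CrawlerRetreiver testRetreiver(HttpFetcher fetcher,
                                       CrawlSpecRecord specs,
                                       WarcRecorder recorder,
                                       Consumer<SerializableCrawlData> sink) {
    return new CrawlerRetreiver(fetcher, new DomainProber(d -> true), specs, recorder, sink);
}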

View File

@ -9,6 +9,7 @@ import nu.marginalia.io.processed.DomainRecordParquetFileReader;
import nu.marginalia.loading.LoaderInputData;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.processed.DomainRecord;
+import nu.marginalia.model.processed.DomainWithIp;
import nu.marginalia.process.control.ProcessHeartbeatImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -51,9 +52,9 @@ public class DomainLoaderService {
) {
try (var inserter = new DomainInserter(conn, nodeId)) {
-for (var domain : readSetDomainNames(inputData)) {
-inserter.accept(new EdgeDomain(domain));
-domainNamesAll.add(domain);
+for (var domainWithIp : readBasicDomainInformation(inputData)) {
+inserter.accept(new EdgeDomain(domainWithIp.domain));
+domainNamesAll.add(domainWithIp.domain);
}
}
try (var inserter = new DomainInserter(conn, -1)) {
@ -63,9 +64,9 @@ public class DomainLoaderService {
}
}
-try (var updater = new DomainAffinityUpdater(conn, nodeId)) {
-for (var domain : readSetDomainNames(inputData)) {
-updater.accept(new EdgeDomain(domain));
+try (var updater = new DomainAffinityAndIpUpdater(conn, nodeId)) {
+for (var domainWithIp : readBasicDomainInformation(inputData)) {
+updater.accept(new EdgeDomain(domainWithIp.domain), domainWithIp.ip);
}
}
@ -84,15 +85,15 @@ public class DomainLoaderService {
return ret;
}
-Collection<String> readSetDomainNames(LoaderInputData inputData) throws IOException {
-final Set<String> domainNamesAll = new HashSet<>(100_000);
+Collection<DomainWithIp> readBasicDomainInformation(LoaderInputData inputData) throws IOException {
+final Set<DomainWithIp> domainsAll = new HashSet<>(100_000);
var domainFiles = inputData.listDomainFiles();
for (var file : domainFiles) {
-domainNamesAll.addAll(DomainRecordParquetFileReader.getDomainNames(file));
+domainsAll.addAll(DomainRecordParquetFileReader.getBasicDomainInformation(file));
}
-return domainNamesAll;
+return domainsAll;
}
Collection<String> readReferencedDomainNames(LoaderInputData inputData) throws IOException {
@ -164,20 +165,25 @@ public class DomainLoaderService {
statement.close();
}
}
-private static class DomainAffinityUpdater implements AutoCloseable {
+private static class DomainAffinityAndIpUpdater implements AutoCloseable {
private final PreparedStatement statement;
private final int nodeAffinity;
private int count = 0;
-public DomainAffinityUpdater(Connection connection, int affinity) throws SQLException {
+public DomainAffinityAndIpUpdater(Connection connection, int affinity) throws SQLException {
this.nodeAffinity = affinity;
-statement = connection.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY = ? WHERE DOMAIN_NAME=?");
+statement = connection.prepareStatement("""
+UPDATE EC_DOMAIN
+SET NODE_AFFINITY = ?, IP = ?
+WHERE DOMAIN_NAME=?
+""");
}
-public void accept(EdgeDomain domain) throws SQLException {
+public void accept(EdgeDomain domain, String ip) throws SQLException {
statement.setInt(1, nodeAffinity);
-statement.setString(2, domain.toString());
+statement.setString(2, ip);
+statement.setString(3, domain.toString());
statement.addBatch();
if (++count > 1000) {
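The updater keeps the usual JDBC batching idiom: parameters are queued with addBatch() and flushed once a threshold is hit, with a final executeBatch() presumably happening in close(), which the diff cuts off before. The pattern in isolation (table and column names here are illustrative only):

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;

// Minimal sketch of the batch-update idiom DomainAffinityAndIpUpdater follows.
class BatchedUpdater implements AutoCloseable {
    private final PreparedStatement statement;
    private int count = 0;

    BatchedUpdater(Connection conn) throws SQLException {
        statement = conn.prepareStatement("UPDATE EXAMPLE_TABLE SET VALUE = ? WHERE NAME = ?");
    }

    void accept(String name, int value) throws SQLException {
        statement.setInt(1, value);
        statement.setString(2, name);
        statement.addBatch();
        if (++count > 1000) {      // flush periodically to bound memory use
            statement.executeBatch();
            count = 0;
        }
    }

    @Override
    public void close() throws SQLException {
        statement.executeBatch();  // flush the tail
        statement.close();
    }
}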

View File

@ -6,7 +6,6 @@ import nu.marginalia.ProcessConfiguration;
import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter;
import nu.marginalia.io.processed.DomainRecordParquetFileWriter;
import nu.marginalia.io.processed.ProcessedDataFileNames;
-import nu.marginalia.loader.DbTestUtil;
import nu.marginalia.loading.LoaderInputData;
import nu.marginalia.model.processed.DomainLinkRecord;
import nu.marginalia.model.processed.DomainRecord;
@ -21,10 +20,8 @@ import org.testcontainers.junit.jupiter.Testcontainers;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.sql.SQLException;
import java.util.*;
import java.util.stream.Collectors;
-import java.util.stream.Stream;
import static org.junit.jupiter.api.Assertions.*;
@ -99,7 +96,7 @@ class DomainLoaderServiceTest {
// Verify
Set<String> expectedDomains1 = Sets.union(new HashSet<>(domains1), new HashSet<>(domains2));
-assertEquals(expectedDomains1, domainService.readSetDomainNames(new LoaderInputData(workDir, 2)));
+assertEquals(expectedDomains1, domainService.readBasicDomainInformation(new LoaderInputData(workDir, 2)).stream().map(d -> d.domain).collect(Collectors.toSet()));
Set<String> expectedDomains2 = new HashSet<>(linkDomains);
assertEquals(expectedDomains2, domainService.readReferencedDomainNames(new LoaderInputData(workDir, 2)));

View File

@ -10,8 +10,6 @@ import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.index.query.limit.QueryLimits;
-import nu.marginalia.index.query.limit.SpecificationLimit;
-import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.query.client.QueryClient;
import nu.marginalia.query.model.QueryParams;
@ -64,7 +62,7 @@ public class ApiSearchOperator {
return switch (index) {
case 0 -> SearchSetIdentifier.NONE;
case 1 -> SearchSetIdentifier.SMALLWEB;
-case 2 -> SearchSetIdentifier.RETRO;
+case 2 -> SearchSetIdentifier.POPULAR;
case 3 -> SearchSetIdentifier.NONE;
case 5 -> SearchSetIdentifier.NONE;
default -> SearchSetIdentifier.NONE;

View File

@ -31,7 +31,6 @@ public class SearchService extends Service {
SearchFrontPageService frontPageService,
SearchErrorPageService errorPageService,
SearchAddToCrawlQueueService addToCrawlQueueService,
-SearchFlagSiteService flagSiteService,
SearchSiteInfoService siteInfoService,
SearchQueryService searchQueryService
) {

View File

@ -1,28 +0,0 @@
package nu.marginalia.search.model;
import lombok.*;
import nu.marginalia.model.EdgeDomain;
import java.util.List;
@Getter @AllArgsConstructor @NoArgsConstructor @Builder
@ToString
public class DomainInformation {
EdgeDomain domain;
boolean blacklisted;
int pagesKnown;
int pagesFetched;
int pagesIndexed;
int incomingLinks;
int outboundLinks;
int nodeAffinity;
double ranking;
boolean suggestForCrawling;
boolean inCrawlQueue;
boolean unknownDomain;
String state;
List<EdgeDomain> linkingDomains;
}

View File

@ -5,7 +5,6 @@ import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.command.SearchAdtechParameter;
import nu.marginalia.search.command.SearchJsParameter;
import nu.marginalia.search.command.SearchParameters;
-import org.apache.regexp.RE;
import java.util.List;
@ -37,7 +36,7 @@ public class SearchFilters {
filterGroups = List.of(
List.of(
new Filter("No Filter", SearchProfile.NO_FILTER, parameters),
-new Filter("Popular", SearchProfile.DEFAULT, parameters),
+new Filter("Popular", SearchProfile.POPULAR, parameters),
new Filter("Small Web", SearchProfile.SMALLWEB, parameters),
new Filter("Blogosphere", SearchProfile.BLOGOSPHERE, parameters),
new Filter("Academia", SearchProfile.ACADEMIA, parameters)

View File

@ -8,19 +8,16 @@ import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import java.util.Objects;
public enum SearchProfile {
-DEFAULT("default", SearchSetIdentifier.RETRO),
+POPULAR("default", SearchSetIdentifier.POPULAR),
SMALLWEB("modern", SearchSetIdentifier.SMALLWEB),
BLOGOSPHERE("blogosphere", SearchSetIdentifier.BLOGS),
NO_FILTER("corpo", SearchSetIdentifier.NONE),
-YOLO("yolo", SearchSetIdentifier.NONE),
VINTAGE("vintage", SearchSetIdentifier.NONE),
TILDE("tilde", SearchSetIdentifier.NONE),
CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE),
-ACADEMIA("academia", SearchSetIdentifier.ACADEMIA),
+ACADEMIA("academia", SearchSetIdentifier.NONE),
PLAIN_TEXT("plain-text", SearchSetIdentifier.NONE),
-FOOD("food", SearchSetIdentifier.NONE),
-CRAFTS("crafts", SearchSetIdentifier.NONE),
-CLASSICS("classics", SearchSetIdentifier.NONE),
+FOOD("food", SearchSetIdentifier.POPULAR),
FORUM("forum", SearchSetIdentifier.NONE),
WIKI("wiki", SearchSetIdentifier.NONE),
DOCS("docs", SearchSetIdentifier.NONE),
@ -38,7 +35,7 @@ public enum SearchProfile {
private final static SearchProfile[] values = values();
public static SearchProfile getSearchProfile(String param) {
if (null == param) {
-return NO_FILTER;
+return POPULAR;
}
for (var profile : values) {
@ -47,12 +44,12 @@ public enum SearchProfile {
}
}
-return NO_FILTER;
+return POPULAR;
}
public void addTacitTerms(SearchSubquery subquery) {
if (this == ACADEMIA) {
-subquery.searchTermsPriority.add("tld:edu");
+subquery.searchTermsAdvice.add("special:academia");
}
if (this == VINTAGE) {
subquery.searchTermsPriority.add("format:html123");
@ -75,9 +72,7 @@ public enum SearchProfile {
}
if (this == FOOD) {
subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
-}
-if (this == CRAFTS) {
-subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_CRAFTS.getKeyword());
+subquery.searchTermsExclude.add("special:ads");
}
}
@ -106,13 +101,5 @@ public enum SearchProfile {
else return SpecificationLimit.none();
}
-public String getNearDomain() {
-if (this == CLASSICS) {
-return "classics.mit.edu";
-}
-return null;
-}
}
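With this change the effective default becomes POPULAR rather than NO_FILTER: both a null parameter and (after the loop falls through) an unrecognized one now resolve to it. Illustrative calls, assuming the loop matches on the profile's parameter name as the visible code suggests:

SearchProfile a = SearchProfile.getSearchProfile(null);     // POPULAR
SearchProfile b = SearchProfile.getSearchProfile("bogus");  // POPULAR, unknown params fall through
SearchProfile c = SearchProfile.getSearchProfile("modern"); // SMALLWEB, matched by name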

View File

@ -1,276 +0,0 @@
package nu.marginalia.search.siteinfo;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.search.model.DomainInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.sql.SQLException;
import java.util.*;
/*
TODO: This class needs to be refactored, a lot of
these SQL queries are redundant and can be
collapsed into one single query that fetches
all the information
*/
@Singleton
public class DomainInformationService {
private DbDomainQueries dbDomainQueries;
private HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public DomainInformationService(
DbDomainQueries dbDomainQueries,
HikariDataSource dataSource) {
this.dbDomainQueries = dbDomainQueries;
this.dataSource = dataSource;
}
public Optional<DomainInformation> domainInfo(String site) {
OptionalInt maybeDomainId = getDomainFromPartial(site);
if (maybeDomainId.isEmpty()) {
return Optional.empty();
}
int domainId = maybeDomainId.getAsInt();
Optional<EdgeDomain> domain = dbDomainQueries.getDomain(domainId);
if (domain.isEmpty()) {
return Optional.empty();
}
boolean blacklisted = isBlacklisted(domain.get());
int pagesKnown = getPagesKnown(domainId);
int pagesVisited = getPagesVisited(domainId);
int pagesIndexed = getPagesIndexed(domainId);
int incomingLinks = getIncomingLinks(domainId);
int outboundLinks = getOutboundLinks(domainId);
int nodeAffinity = getNodeAffinity(domainId);
boolean inCrawlQueue = inCrawlQueue(domainId);
double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100;
DomainIndexingState state = getDomainState(domainId);
List<EdgeDomain> linkingDomains = getLinkingDomains(domainId);
var di = DomainInformation.builder()
.domain(domain.get())
.blacklisted(blacklisted)
.pagesKnown(pagesKnown)
.pagesFetched(pagesVisited)
.pagesIndexed(pagesIndexed)
.incomingLinks(incomingLinks)
.outboundLinks(outboundLinks)
.ranking(rank)
.state(state.desc)
.linkingDomains(linkingDomains)
.inCrawlQueue(inCrawlQueue)
.nodeAffinity(nodeAffinity)
.suggestForCrawling((pagesVisited == 0 && outboundLinks == 0 && !inCrawlQueue))
.build();
return Optional.of(di);
}
private int getNodeAffinity(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("""
SELECT NODE_AFFINITY FROM EC_DOMAIN WHERE ID=?
""")) {
stmt.setInt(1, domainId);
var rs = stmt.executeQuery();
if (rs.next())
return rs.getInt(1);
}
}
catch (SQLException ex) {
logger.error("SQL error", ex);
}
return -1;
}
@SneakyThrows
private boolean inCrawlQueue(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
SELECT 1 FROM CRAWL_QUEUE
INNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME
WHERE EC_DOMAIN.ID=?
"""))
{
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
return rsp.next();
}
}
}
private OptionalInt getDomainFromPartial(String site) {
return dbDomainQueries.tryGetDomainId(new EdgeDomain(site));
}
@SneakyThrows
public boolean isBlacklisted(EdgeDomain domain) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN IN (?,?)")) {
stmt.setString(1, domain.domain);
stmt.setString(2, domain.toString());
var rsp = stmt.executeQuery();
return rsp.next();
}
}
}
@SneakyThrows
public int getPagesKnown(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
public int getPagesVisited(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
public int getPagesIndexed(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
public int getIncomingLinks(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
public int getOutboundLinks(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
public DomainIndexingState getDomainState(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return DomainIndexingState.valueOf(rsp.getString(1));
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return DomainIndexingState.ERROR;
}
public List<EdgeDomain> getLinkingDomains(int domainId) {
try (var connection = dataSource.getConnection()) {
List<EdgeDomain> results = new ArrayList<>(25);
try (var stmt = connection.prepareStatement("SELECT SOURCE_DOMAIN FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
while (rsp.next()) {
results.add(new EdgeDomain(rsp.getString(1)));
}
return results;
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return Collections.emptyList();
}
public double getRank(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getDouble(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return 1;
}
}

View File

@ -1,16 +1,16 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
+import nu.marginalia.assistant.client.AssistantClient;
+import nu.marginalia.assistant.client.model.SimilarDomain;
import nu.marginalia.client.Context;
import nu.marginalia.db.DbDomainQueries;
-import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.search.SearchOperator;
-import nu.marginalia.search.model.DomainInformation;
+import nu.marginalia.assistant.client.model.DomainInformation;
import nu.marginalia.search.model.UrlDetails;
-import nu.marginalia.search.siteinfo.DomainInformationService;
import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
import spark.Request;
import spark.Response;
@ -23,22 +23,19 @@ import java.util.Map;
public class SearchSiteInfoService {
private final SearchOperator searchOperator;
-private final SimilarDomainsService similarDomains;
-private final DomainInformationService domainInformationService;
+private final AssistantClient assistantClient;
private final SearchFlagSiteService flagSiteService;
private final DbDomainQueries domainQueries;
private final MustacheRenderer<Object> renderer;
@Inject
public SearchSiteInfoService(SearchOperator searchOperator,
-SimilarDomainsService similarDomains,
-DomainInformationService domainInformationService,
+AssistantClient assistantClient,
RendererFactory rendererFactory,
SearchFlagSiteService flagSiteService,
DbDomainQueries domainQueries) throws IOException {
this.searchOperator = searchOperator;
-this.similarDomains = similarDomains;
-this.domainInformationService = domainInformationService;
+this.assistantClient = assistantClient;
this.flagSiteService = flagSiteService;
this.domainQueries = domainQueries;
@ -108,13 +105,6 @@ public class SearchSiteInfoService {
false);
}
-private DomainInformation dummyInformation(String domainName) {
-return DomainInformation.builder()
-.domain(new EdgeDomain(domainName))
-.suggestForCrawling(true)
-.unknownDomain(true)
-.build();
-}
private Backlinks listLinks(Context ctx, String domainName) {
return new Backlinks(domainName,
@ -126,13 +116,24 @@ public class SearchSiteInfoService {
final int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);
-final DomainInformation domainInfo = domainInformationService.domainInfo(domainName)
-.orElseGet(() -> dummyInformation(domainName));
-final List<SimilarDomainsService.SimilarDomain> similarSet =
-similarDomains.getSimilarDomains(domainId, 100);
-final List<SimilarDomainsService.SimilarDomain> linkingDomains =
-similarDomains.getLinkingDomains(domainId, 100);
+final DomainInformation domainInfo;
+final List<SimilarDomain> similarSet;
+final List<SimilarDomain> linkingDomains;
+if (domainId < 0 || !assistantClient.isAccepting()) {
+domainInfo = createDummySiteInfo(domainName);
+similarSet = List.of();
+linkingDomains = List.of();
+}
+else {
+domainInfo = assistantClient.domainInformation(ctx, domainId).blockingFirst();
+similarSet = assistantClient
+.similarDomains(ctx, domainId, 100)
+.blockingFirst();
+linkingDomains = assistantClient
+.linkedDomains(ctx, domainId, 100)
+.blockingFirst();
+}
return new SiteInfoWithContext(domainName,
domainId,
@ -141,6 +142,15 @@ public class SearchSiteInfoService {
linkingDomains
);
}
+private DomainInformation createDummySiteInfo(String domainName) {
+return DomainInformation.builder()
+.domain(new EdgeDomain(domainName))
+.suggestForCrawling(true)
+.unknownDomain(true)
+.build();
+}
private Docs listDocs(Context ctx, String domainName) {
return new Docs(domainName,
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
@ -181,13 +191,13 @@ public class SearchSiteInfoService {
String domain,
long domainId,
DomainInformation domainInformation,
-List<SimilarDomainsService.SimilarDomain> similar,
-List<SimilarDomainsService.SimilarDomain> linking) {
+List<SimilarDomain> similar,
+List<SimilarDomain> linking) {
public SiteInfoWithContext(String domain,
long domainId,
DomainInformation domainInformation,
-List<SimilarDomainsService.SimilarDomain> similar,
-List<SimilarDomainsService.SimilarDomain> linking
+List<SimilarDomain> similar,
+List<SimilarDomain> linking
)
{
this(Map.of("info", true),

View File

@ -99,7 +99,6 @@
</section>
<section id="legal">
<h1>Policies</h1>
This website complies with the GDPR by <em>not collecting any personal
information</em>, and with the EU Cookie Directive by <em>not using
cookies</em>. <a href="https://memex.marginalia.nu/projects/edge/privacy.gmi">More Information</a>.
@ -109,8 +108,13 @@
<h1> Open Source </h1>
The search engine is open source with an AGPL license. The sources can be perused at
<tt><a href="https://git.marginalia.nu/">https://git.marginalia.nu/</a></tt>.
+<h1>Data Sources</h1>
+IP geolocation is sourced from the IP2Location LITE data available from
+<a rel="external noopener nofollow" href="https://lite.ip2location.com/">https://lite.ip2location.com/</a>
+under
+<a rel="external noopener nofollow" href="https://creativecommons.org/licenses/by-sa/4.0/">CC-BY-SA&nbsp;4.0</a>.
</section>
</footer>
<script src="/tts.js"></script>

View File

@ -6,5 +6,6 @@
Pages Known: {{pagesKnown}} <br/>
Pages Crawled: {{pagesFetched}} <br/>
Pages Indexed: {{pagesIndexed}} <br/>
+IP: {{ip}} {{#if ipCountry}}<span title="{{ipCountry}}">{{getIpFlag}}</span>{{/if}}<br/>
</fieldset>
<br/>

View File

@ -27,11 +27,13 @@ dependencies {
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:common:model')
+implementation project(':code:common:db')
implementation project(':code:common:service-discovery')
implementation project(':code:common:service-client')
implementation project(':code:features-search:screenshots')
+implementation project(':code:libraries:geo-ip')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')

View File

@ -3,6 +3,8 @@ package nu.marginalia.assistant;
import com.google.gson.Gson;
import com.google.inject.Inject;
import lombok.SneakyThrows;
+import nu.marginalia.assistant.domains.DomainInformationService;
+import nu.marginalia.assistant.domains.SimilarDomainsService;
import nu.marginalia.assistant.eval.Units;
import nu.marginalia.assistant.suggest.Suggestions;
import nu.marginalia.assistant.eval.MathParser;
@ -16,11 +18,16 @@ import spark.Request;
import spark.Response;
import spark.Spark;
+import java.util.ArrayList;
+import java.util.Objects;
public class AssistantService extends Service {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = GsonFactory.get();
private final Units units;
private final MathParser mathParser;
+private final SimilarDomainsService similarDomainsService;
+private final DomainInformationService domainInformationService;
private final Suggestions suggestions;
@SneakyThrows
@ -30,12 +37,16 @@ public class AssistantService extends Service {
MathParser mathParser,
Units units,
ScreenshotService screenshotService,
+SimilarDomainsService similarDomainsService,
+DomainInformationService domainInformationService,
Suggestions suggestions)
{
super(params);
this.mathParser = mathParser;
this.units = units;
+this.similarDomainsService = similarDomainsService;
+this.domainInformationService = domainInformationService;
this.suggestions = suggestions;
Spark.staticFiles.expireTime(600);
@ -56,12 +67,50 @@ public class AssistantService extends Service {
rsp,
req.queryParams("value")
));
+Spark.get("/domain/:id/similar", this::getSimilarDomains, this::convertToJson);
+Spark.get("/domain/:id/linking", this::getLinkingDomains, this::convertToJson);
+Spark.get("/domain/:id/info", this::getDomainInformation, this::convertToJson);
Spark.get("/public/suggest/", this::getSuggestions, this::convertToJson);
Spark.awaitInitialization();
}
private Object getSimilarDomains(Request request, Response response) {
int domainId = Integer.parseInt(request.params("id"));
int count = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("count"), "25"));
response.type("application/json");
if (!similarDomainsService.isReady()) {
return new ArrayList<>();
}
return similarDomainsService.getSimilarDomains(domainId, count);
}
private Object getLinkingDomains(Request request, Response response) {
int domainId = Integer.parseInt(request.params("id"));
int count = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("count"), "25"));
response.type("application/json");
if (!similarDomainsService.isReady()) {
return new ArrayList<>();
}
return similarDomainsService.getLinkingDomains(domainId, count);
}
private Object getDomainInformation(Request request, Response response) {
int domainId = Integer.parseInt(request.params("id"));
response.type("application/json");
var maybeDomainInfo = domainInformationService.domainInfo(domainId);
if (maybeDomainInfo.isEmpty()) {
Spark.halt(404);
}
return maybeDomainInfo.get();
}
private Object getSuggestions(Request request, Response response) {
response.type("application/json");
var param = request.queryParams("partial");

View File

@ -0,0 +1,115 @@
package nu.marginalia.assistant.domains;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.assistant.client.model.DomainInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
@Singleton
public class DomainInformationService {
private final GeoIpDictionary geoIpDictionary;
private DbDomainQueries dbDomainQueries;
private HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public DomainInformationService(
DbDomainQueries dbDomainQueries,
GeoIpDictionary geoIpDictionary,
HikariDataSource dataSource) {
this.dbDomainQueries = dbDomainQueries;
this.geoIpDictionary = geoIpDictionary;
this.dataSource = dataSource;
}
public Optional<DomainInformation> domainInfo(int domainId) {
Optional<EdgeDomain> domain = dbDomainQueries.getDomain(domainId);
if (domain.isEmpty()) {
return Optional.empty();
}
var builder = DomainInformation.builder();
try (var connection = dataSource.getConnection();
var stmt = connection.createStatement();
) {
boolean inCrawlQueue;
int outboundLinks = 0;
int pagesVisited = 0;
ResultSet rs;
rs = stmt.executeQuery(STR."""
SELECT IP, NODE_AFFINITY, DOMAIN_NAME, STATE, IFNULL(RANK, 1) AS RANK
FROM EC_DOMAIN WHERE ID=\{domainId}
""");
if (rs.next()) {
String ip = rs.getString("IP");
builder.ip(ip);
builder.ipCountry(geoIpDictionary.getCountry(ip));
builder.nodeAffinity(rs.getInt("NODE_AFFINITY"));
builder.domain(new EdgeDomain(rs.getString("DOMAIN_NAME")));
builder.state(rs.getString("STATE"));
builder.ranking(Math.round(100.0*(1.0-rs.getDouble("RANK"))));
}
rs = stmt.executeQuery(STR."""
SELECT 1 FROM CRAWL_QUEUE
INNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME
WHERE EC_DOMAIN.ID=\{domainId}
""");
inCrawlQueue = rs.next();
builder.inCrawlQueue(inCrawlQueue);
rs = stmt.executeQuery(STR."""
SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=\{domainId}
""");
if (rs.next()) {
builder.incomingLinks(rs.getInt(1));
}
rs = stmt.executeQuery(STR."""
SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=\{domainId}
""");
if (rs.next()) {
builder.outboundLinks(rs.getInt(1));
outboundLinks = rs.getInt(1);
}
rs = stmt.executeQuery(STR."""
SELECT KNOWN_URLS, GOOD_URLS, VISITED_URLS FROM DOMAIN_METADATA WHERE ID=\{domainId}
""");
if (rs.next()) {
pagesVisited = rs.getInt("VISITED_URLS");
builder.pagesKnown(rs.getInt("KNOWN_URLS"));
builder.pagesIndexed(rs.getInt("GOOD_URLS"));
builder.pagesFetched(rs.getInt("VISITED_URLS"));
}
builder.suggestForCrawling((pagesVisited == 0 && outboundLinks == 0 && !inCrawlQueue));
return Optional.of(builder.build());
}
catch (SQLException ex) {
logger.error("SQL error", ex);
return Optional.empty();
}
}
}
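The queries above use JDK 21's string-template preview syntax (STR."...\{domainId}") to interpolate the id directly into the SQL. That is safe here only because domainId is an int; for anything string-valued, a PreparedStatement remains the right tool. A hedged sketch of the first query in that style, reusing the surrounding method's dataSource and domainId:

// Equivalent of the first query using a PreparedStatement instead of STR interpolation:
try (var connection = dataSource.getConnection();
     var stmt = connection.prepareStatement("""
             SELECT IP, NODE_AFFINITY, DOMAIN_NAME, STATE, IFNULL(RANK, 1) AS RANK
             FROM EC_DOMAIN WHERE ID=?
             """)) {
    stmt.setInt(1, domainId);
    var rs = stmt.executeQuery();
    // ... populate the builder exactly as in the method above
}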

View File

@ -1,4 +1,4 @@
-package nu.marginalia.search.svc;
+package nu.marginalia.assistant.domains;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
@ -6,11 +6,10 @@ import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntDoubleHashMap;
import gnu.trove.map.hash.TIntIntHashMap;
-import gnu.trove.map.hash.TLongDoubleHashMap;
import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet;
+import nu.marginalia.assistant.client.model.SimilarDomain;
import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -40,6 +39,8 @@ public class SimilarDomainsService {
public volatile double[] domainRanks = null;
public volatile String[] domainNames = null;
+volatile boolean isReady = false;
@Inject
public SimilarDomainsService(HikariDataSource dataSource) {
this.dataSource = dataSource;
@ -167,6 +168,7 @@ public class SimilarDomainsService {
logger.info("Loaded {} domains", domainRanks.length);
logger.info("All done!");
+isReady = true;
}
}
catch (SQLException throwables) {
@ -174,7 +176,11 @@ public class SimilarDomainsService {
}
}
-double getRelatedness(int a, int b) {
+public boolean isReady() {
+return isReady;
+}
+private double getRelatedness(int a, int b) {
int lowerIndex = Math.min(domainIdToIdx.get(a), domainIdToIdx.get(b));
int higherIndex = Math.max(domainIdToIdx.get(a), domainIdToIdx.get(b));
@ -233,14 +239,14 @@ public class SimilarDomainsService {
indexedDomains.get(idx),
activeDomains.get(idx),
screenshotDomains.get(idx),
-LinkType.find(
+SimilarDomain.LinkType.find(
linkingIdsStoD.contains(idx),
linkingIdsDtoS.contains(idx)
)
));
}
-domains.removeIf(d -> d.url.domain.toString().length() > 32);
+domains.removeIf(d -> d.url().domain.toString().length() > 32);
return domains;
}
@ -319,84 +325,16 @@ public class SimilarDomainsService {
indexedDomains.get(idx),
activeDomains.get(idx),
screenshotDomains.get(idx),
-LinkType.find(
+SimilarDomain.LinkType.find(
linkingIdsStoD.contains(idx),
linkingIdsDtoS.contains(idx)
)
));
}
-domains.removeIf(d -> d.url.domain.toString().length() > 32);
+domains.removeIf(d -> d.url().domain.toString().length() > 32);
return domains;
}
public record SimilarDomain(EdgeUrl url,
int domainId,
double relatedness,
double rank,
boolean indexed,
boolean active,
boolean screenshot,
LinkType linkType)
{
public String getRankSymbols() {
if (rank > 90) {
return "&#9733;&#9733;&#9733;&#9733;&#9733;";
}
if (rank > 70) {
return "&#9733;&#9733;&#9733;&#9733;";
}
if (rank > 50) {
return "&#9733;&#9733;&#9733;";
}
if (rank > 30) {
return "&#9733;&#9733;";
}
if (rank > 10) {
return "&#9733;";
}
return "";
}
}
enum LinkType {
BACKWARD,
FOWARD,
BIDIRECTIONAL,
NONE;
public static LinkType find(boolean linkStod,
boolean linkDtos)
{
if (linkDtos && linkStod)
return BIDIRECTIONAL;
if (linkDtos)
return FOWARD;
if (linkStod)
return BACKWARD;
return NONE;
}
public String toString() {
return switch (this) {
case FOWARD -> "&#8594;";
case BACKWARD -> "&#8592;";
case BIDIRECTIONAL -> "&#8646;";
case NONE -> "-";
};
}
public String getDescription() {
return switch (this) {
case BACKWARD -> "Backward Link";
case FOWARD -> "Forward Link";
case BIDIRECTIONAL -> "Mutual Link";
case NONE -> "No Link";
};
}
};
}

View File

@ -20,8 +20,9 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
public class Suggestions {
-private final PatriciaTrie<String> suggestionsTrie;
-private final TermFrequencyDict termFrequencyDict;
+private PatriciaTrie<String> suggestionsTrie = null;
+private TermFrequencyDict termFrequencyDict = null;
+private volatile boolean ready = false;
private final SpellChecker spellChecker;
private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
@ -35,10 +36,12 @@ public class Suggestions {
) {
this.spellChecker = spellChecker;
-suggestionsTrie = loadSuggestions(suggestionsFile);
-termFrequencyDict = dict;
-logger.info("Loaded {} suggestions", suggestionsTrie.size());
+Thread.ofPlatform().start(() -> {
+suggestionsTrie = loadSuggestions(suggestionsFile);
+termFrequencyDict = dict;
+ready = true;
+logger.info("Loaded {} suggestions", suggestionsTrie.size());
+});
}
private static PatriciaTrie<String> loadSuggestions(Path file) {
@ -71,6 +74,9 @@ public class Suggestions {
}
public List<String> getSuggestions(int count, String searchWord) {
+if (!ready)
+return Collections.emptyList();
if (searchWord.length() < MIN_SUGGEST_LENGTH) {
return Collections.emptyList();
}
@ -126,6 +132,9 @@ public class Suggestions {
public Stream<String> getSuggestionsForKeyword(int count, String prefix) {
+if (!ready)
+return Stream.empty();
if (prefix.length() < MIN_SUGGEST_LENGTH) {
return Stream.empty();
}
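Moving the trie construction onto a background thread means the service starts answering requests before the large suggestions file has loaded; the volatile ready flag makes callers degrade to empty results in the meantime, and its write also publishes the non-volatile fields assigned before it. The shape of the pattern, reduced to its essentials (names assumed, not from the diff):

import java.util.List;
import java.util.Map;

// Minimal sketch of lazy initialization behind a volatile ready flag.
class SlowLoadedLookup {
    private Map<String, String> data = null;
    private volatile boolean ready = false;

    SlowLoadedLookup() {
        Thread.ofPlatform().start(() -> {
            data = loadEverything();  // slow I/O happens off the startup path
            ready = true;             // volatile write publishes `data` to readers
        });
    }

    List<String> lookup(String key) {
        if (!ready)
            return List.of();         // degrade gracefully while loading
        return List.of(data.getOrDefault(key, ""));
    }

    private Map<String, String> loadEverything() {
        return Map.of("example", "value");  // stands in for parsing the real file
    }
}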

View File

@ -25,7 +25,7 @@ public class RecrawlAllActor extends RecordActorPrototype {
public record Initial() implements ActorStep {}
-public record WaitFinished(int node) implements ActorStep {}
+public record WaitFinished(int node, long msgId) implements ActorStep {}
@Resume(behavior=ActorResumeBehavior.RETRY)
public record Trigger(int node) implements ActorStep {}
public record AdvanceNode(int node) implements ActorStep {}
@ -49,17 +49,18 @@ public class RecrawlAllActor extends RecordActorPrototype {
var data = new ExecutorRemoteActorFactory.CrawlData(activeFileStorage.get(0), true);
-if (remoteActorFactory.createCrawlRemote(node).trigger(data)) {
-yield new WaitFinished(node);
+long msgId = remoteActorFactory.createCrawlRemote(node).trigger(data);
+if (msgId >= 0) {
+yield new WaitFinished(node, msgId);
}
else {
yield new AdvanceNode(node);
}
}
-case WaitFinished(int node) -> {
+case WaitFinished(int node, long msgId) -> {
var remoteActor = remoteActorFactory.createCrawlRemote(node);
for (;;) {
-var state = remoteActor.getState();
+var state = remoteActor.getState(msgId);
if ("END".equals(state) || "ERROR".equals(state)) {
break;
}
@ -80,8 +81,7 @@ public class RecrawlAllActor extends RecordActorPrototype {
public RecrawlAllActor(Gson gson,
ExecutorRemoteActorFactory remoteActorFactory,
FileStorageService fileStorageService,
-PrecessionNodes precessionNodes,
-NodeConfigurationService nodeConfigurationService)
+PrecessionNodes precessionNodes)
{
super(gson);
this.remoteActorFactory = remoteActorFactory;

View File

@@ -24,7 +24,7 @@ public class ReprocessAllActor extends RecordActorPrototype {
 
     public record Initial() implements ActorStep {}
-    public record WaitFinished(int node) implements ActorStep {}
+    public record WaitFinished(int node, long msgId) implements ActorStep {}
 
     @Resume(behavior=ActorResumeBehavior.RETRY)
     public record Trigger(int node) implements ActorStep {}
     public record AdvanceNode(int node) implements ActorStep {}
@@ -47,17 +47,18 @@ public class ReprocessAllActor extends RecordActorPrototype {
 
                 var data = new ExecutorRemoteActorFactory.ConvertAndLoadData(activeFileStorage.get(0));
 
-                if (remoteActorFactory.createConvertAndLoadRemote(node).trigger(data)) {
-                    yield new WaitFinished(node);
+                long msgId = remoteActorFactory.createConvertAndLoadRemote(node).trigger(data);
+                if (msgId >= 0) {
+                    yield new WaitFinished(node, msgId);
                 }
                 else {
                     yield new AdvanceNode(node);
                 }
             }
-            case WaitFinished(int node) -> {
+            case WaitFinished(int node, long msgId) -> {
                 var remoteActor = remoteActorFactory.createConvertAndLoadRemote(node);
                 for (;;) {
-                    var state = remoteActor.getState();
+                    var state = remoteActor.getState(msgId);
                     if ("END".equals(state) || "ERROR".equals(state))
                         break;
 
                     TimeUnit.SECONDS.sleep(10);
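Both this hunk and the RecrawlAllActor hunk above move from a boolean trigger() to a long message id (negative on failure) that getState() then polls. A rough, self-contained sketch of that protocol, assuming a hypothetical RemoteJob interface in place of the project's ExecutorRemoteActor:

import java.util.concurrent.TimeUnit;

// Sketch of the trigger/poll-by-message-id protocol introduced above.
interface RemoteJob {
    long trigger(String payload);   // returns msgId >= 0 on success, < 0 on failure
    String getState(long msgId);    // e.g. "RUNNING", "END", "ERROR"
}

class RemoteJobRunner {
    static boolean runAndWait(RemoteJob job, String payload) throws InterruptedException {
        long msgId = job.trigger(payload);
        if (msgId < 0)
            return false; // could not start; the actors above advance to the next node

        for (;;) {
            String state = job.getState(msgId); // poll the specific message we triggered
            if ("END".equals(state))
                return true;
            if ("ERROR".equals(state))
                return false;
            TimeUnit.SECONDS.sleep(10); // same polling cadence as the actors above
        }
    }
}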

View File

@@ -37,7 +37,7 @@ public class IndexSearchSetsService {
 
     // Below are binary indices that are used to constrain a search
-    private volatile RankingSearchSet retroSet;
+    private volatile RankingSearchSet popularSet;
     private volatile RankingSearchSet smallWebSet;
     private volatile RankingSearchSet academiaSet;
     private volatile RankingSearchSet blogsSet;
@@ -72,7 +72,7 @@ public class IndexSearchSetsService {
         smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
         academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat"));
-        retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat"));
+        popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, servicesFactory.getSearchSetsBase().resolve("popular.dat"));
         blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, servicesFactory.getSearchSetsBase().resolve("blogs.dat"));
     }
@@ -86,7 +86,7 @@ public class IndexSearchSetsService {
         }
         return switch (searchSetIdentifier) {
             case NONE -> anySet;
-            case RETRO -> retroSet;
+            case POPULAR -> popularSet;
             case ACADEMIA -> academiaSet;
             case SMALLWEB -> smallWebSet;
             case BLOGS -> blogsSet;
@@ -95,7 +95,7 @@ public class IndexSearchSetsService {
     enum RepartitionSteps {
         UPDATE_ACADEMIA,
-        UPDATE_RETRO,
+        UPDATE_POPULAR,
         UPDATE_SMALL_WEB,
         UPDATE_BLOGS,
         UPDATE_RANKINGS,
@@ -107,8 +107,8 @@ public class IndexSearchSetsService {
         processHeartbeat.progress(RepartitionSteps.UPDATE_ACADEMIA);
         updateAcademiaDomainsSet();
 
-        processHeartbeat.progress(RepartitionSteps.UPDATE_RETRO);
-        updateRetroDomainsSet();
+        processHeartbeat.progress(RepartitionSteps.UPDATE_POPULAR);
+        updatePopularDomainsSet();
 
         processHeartbeat.progress(RepartitionSteps.UPDATE_SMALL_WEB);
         updateSmallWebDomainsSet();
@@ -139,15 +139,15 @@ public class IndexSearchSetsService {
     }
 
     @SneakyThrows
-    public void updateRetroDomainsSet() {
+    public void updatePopularDomainsSet() {
         var entry = rankingSettings.retro;
 
         var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
         var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
 
         synchronized (this) {
-            retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
-            retroSet.write();
+            popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, popularSet.source, data);
+            popularSet.write();
         }
     }
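For reference, a sketch of the identifier-to-file wiring after the RETRO to POPULAR rename; note that the hunk above still reads the settings entry from rankingSettings.retro, so only the identifier, field, and data file names changed. The types below are illustrative stand-ins, not the service's actual API:

import java.nio.file.Path;

// Sketch: each search-set identifier resolves to its backing data file,
// mirroring the resolve(...) calls in the hunk above.
enum SearchSetId {
    NONE, POPULAR, ACADEMIA, SMALLWEB, BLOGS;

    Path dataFile(Path base) {
        return switch (this) {
            case POPULAR  -> base.resolve("popular.dat");
            case ACADEMIA -> base.resolve("academia.dat");
            case SMALLWEB -> base.resolve("small-web.dat");
            case BLOGS    -> base.resolve("blogs.dat");
            case NONE     -> throw new IllegalArgumentException("NONE has no backing file");
        };
    }
}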

View File

@@ -12,6 +12,7 @@ include 'code:services-application:dating-service'
 include 'code:services-application:explorer-service'
 
 include 'code:libraries:array'
+include 'code:libraries:geo-ip'
 include 'code:libraries:btree'
 include 'code:libraries:easy-lsh'
 include 'code:libraries:guarded-regex'