Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Synced 2025-02-23 13:09:00 +00:00

Merge branch 'master' into warc

Commit 45987a1d98
@ -1,12 +1,14 @@
|
||||
package nu.marginalia.assistant.client;
|
||||
|
||||
import com.google.gson.reflect.TypeToken;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.reactivex.rxjava3.core.Observable;
|
||||
import nu.marginalia.assistant.client.model.DictionaryResponse;
|
||||
import nu.marginalia.assistant.client.model.DomainInformation;
|
||||
import nu.marginalia.assistant.client.model.SimilarDomain;
|
||||
import nu.marginalia.client.AbstractDynamicClient;
|
||||
import nu.marginalia.client.exception.RouteNotConfiguredException;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.service.descriptor.ServiceDescriptors;
|
||||
import nu.marginalia.service.id.ServiceId;
|
||||
@ -14,6 +16,7 @@ import nu.marginalia.client.Context;
|
||||
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Singleton
|
||||
@ -59,4 +62,31 @@ public class AssistantClient extends AbstractDynamicClient {
|
||||
return Observable.empty();
|
||||
}
|
||||
}
|
||||
|
||||
public Observable<ArrayList<SimilarDomain>> similarDomains(Context ctx, int domainId, int count) {
|
||||
try {
|
||||
return super.get(ctx, 0, STR."/domain/\{domainId}/similar?count=\{count}", new TypeToken<ArrayList<SimilarDomain>>() {});
|
||||
}
|
||||
catch (RouteNotConfiguredException ex) {
|
||||
return Observable.empty();
|
||||
}
|
||||
}
|
||||
|
||||
public Observable<ArrayList<SimilarDomain>> linkedDomains(Context ctx, int domainId, int count) {
|
||||
try {
|
||||
return super.get(ctx, 0, STR."/domain/\{domainId}/linking?count=\{count}", new TypeToken<ArrayList<SimilarDomain>>() {});
|
||||
}
|
||||
catch (RouteNotConfiguredException ex) {
|
||||
return Observable.empty();
|
||||
}
|
||||
}
|
||||
|
||||
public Observable<DomainInformation> domainInformation(Context ctx, int domainId) {
|
||||
try {
|
||||
return super.get(ctx, 0, STR."/domain/\{domainId}/info", DomainInformation.class);
|
||||
}
|
||||
catch (RouteNotConfiguredException ex) {
|
||||
return Observable.empty();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,42 @@
package nu.marginalia.assistant.client.model;

import lombok.*;
import nu.marginalia.model.EdgeDomain;

@Getter @AllArgsConstructor @NoArgsConstructor @Builder
@ToString
public class DomainInformation {
    EdgeDomain domain;

    boolean blacklisted;
    int pagesKnown;
    int pagesFetched;
    int pagesIndexed;
    int incomingLinks;
    int outboundLinks;
    int nodeAffinity;
    double ranking;

    boolean suggestForCrawling;
    boolean inCrawlQueue;
    boolean unknownDomain;

    String ip;
    String ipCountry;
    String state;

    public String getIpFlag() {
        if (ipCountry == null || ipCountry.isBlank()) {
            return "";
        }
        String country = ipCountry;
        if ("UK".equals(country)) {
            return "GB";
        }
        int offset = 0x1F1E6;
        int asciiOffset = 0x41;
        int firstChar = Character.codePointAt(country, 0) - asciiOffset + offset;
        int secondChar = Character.codePointAt(country, 1) - asciiOffset + offset;
        return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar));
    }
}
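The getIpFlag conversion above maps an ISO 3166-1 alpha-2 country code onto Unicode regional indicator symbols: each ASCII letter (offset 0x41, 'A') is shifted into the range starting at U+1F1E6, and the resulting pair of code points renders as a flag emoji. A minimal sketch of the same arithmetic with a concrete input; the class name and the example value "SE" are illustrative only, not part of the commit:

public class FlagSketch {
    public static void main(String[] args) {
        String country = "SE";  // hypothetical example input
        int offset = 0x1F1E6;   // first regional indicator symbol, U+1F1E6
        int asciiOffset = 0x41; // 'A'
        int first = country.codePointAt(0) - asciiOffset + offset;
        int second = country.codePointAt(1) - asciiOffset + offset;
        // The two regional indicator code points render together as the Swedish flag emoji
        System.out.println(new String(Character.toChars(first)) + new String(Character.toChars(second)));
    }
}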
@ -0,0 +1,69 @@
|
||||
package nu.marginalia.assistant.client.model;
|
||||
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
public record SimilarDomain(EdgeUrl url,
|
||||
int domainId,
|
||||
double relatedness,
|
||||
double rank,
|
||||
boolean indexed,
|
||||
boolean active,
|
||||
boolean screenshot,
|
||||
LinkType linkType) {
|
||||
|
||||
public String getRankSymbols() {
|
||||
if (rank > 90) {
|
||||
return "★★★★★";
|
||||
}
|
||||
if (rank > 70) {
|
||||
return "★★★★";
|
||||
}
|
||||
if (rank > 50) {
|
||||
return "★★★";
|
||||
}
|
||||
if (rank > 30) {
|
||||
return "★★";
|
||||
}
|
||||
if (rank > 10) {
|
||||
return "★";
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
public enum LinkType {
|
||||
BACKWARD,
|
||||
FOWARD,
|
||||
BIDIRECTIONAL,
|
||||
NONE;
|
||||
|
||||
public static LinkType find(boolean linkStod,
|
||||
boolean linkDtos) {
|
||||
if (linkDtos && linkStod)
|
||||
return BIDIRECTIONAL;
|
||||
if (linkDtos)
|
||||
return FOWARD;
|
||||
if (linkStod)
|
||||
return BACKWARD;
|
||||
|
||||
return NONE;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return switch (this) {
|
||||
case FOWARD -> "→";
|
||||
case BACKWARD -> "←";
|
||||
case BIDIRECTIONAL -> "⇆";
|
||||
case NONE -> "-";
|
||||
};
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return switch (this) {
|
||||
case BACKWARD -> "Backward Link";
|
||||
case FOWARD -> "Forward Link";
|
||||
case BIDIRECTIONAL -> "Mutual Link";
|
||||
case NONE -> "No Link";
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
@ -34,8 +34,23 @@ public class ExecutorRemoteActorFactory {
|
||||
}
|
||||
|
||||
public interface ExecutorRemoteActorIf<T> {
|
||||
boolean trigger(T object) throws Exception;
|
||||
String getState();
|
||||
|
||||
/** Trigger the remote actor with the given object. The object will be serialized to JSON and sent to the
|
||||
* remote actor. If the remote actor does not respond after a time period, a timeout will occur and a negative
|
||||
* message id will be returned.
|
||||
*
|
||||
* @param object The message to send to the remote actor
|
||||
* @return The message id of the response message, or a negative number if the remote actor did not respond
|
||||
* within a reasonable timeout.
|
||||
*/
|
||||
long trigger(T object) throws Exception;
|
||||
|
||||
/** Get the last finished state of the actor.
|
||||
* <p>
|
||||
* The message id of the request initiating the actor must be provided to ensure that
|
||||
* we don't get a state from a previous run.
|
||||
*/
|
||||
String getState(long fromMsgId);
|
||||
}
|
||||
|
||||
public record CrawlData(FileStorageId storageId, boolean cascadeLoad) {}
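For orientation, a minimal sketch of how a caller might use the revised interface: trigger() now returns the id of the initiating message (negative on failure or timeout), and that id is passed back into getState() so the state cannot be confused with a previous run. The enclosing method and the way the actor instance is obtained are assumptions for illustration, not part of this change.

// Hypothetical usage sketch; the actor instance is assumed to come from ExecutorRemoteActorFactory.
void runCrawl(ExecutorRemoteActorFactory.ExecutorRemoteActorIf<ExecutorRemoteActorFactory.CrawlData> actor,
              FileStorageId storageId) throws Exception {
    long msgId = actor.trigger(new ExecutorRemoteActorFactory.CrawlData(storageId, true));
    if (msgId < 0) {
        return; // the remote actor did not acknowledge the message within the timeout
    }
    String state = actor.getState(msgId); // state of this run, not of an earlier one
    System.out.println("Remote actor state: " + state);
}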
|
||||
@ -58,11 +73,11 @@ class ExecutorRemoteActor<T> implements ExecutorRemoteActorFactory.ExecutorRemot
|
||||
this.triggerFunction = triggerFunction;
|
||||
}
|
||||
|
||||
public boolean trigger(T object) throws Exception {
|
||||
public long trigger(T object) throws Exception {
|
||||
return trigger(gson.toJson(object));
|
||||
}
|
||||
|
||||
public boolean trigger(String payload) throws Exception {
|
||||
public long trigger(String payload) throws Exception {
|
||||
long id = persistence.sendNewMessage(inboxName, null, null, triggerFunction, payload, null);
|
||||
|
||||
// Wait for the remote actor to respond to the message
|
||||
@ -70,19 +85,19 @@ class ExecutorRemoteActor<T> implements ExecutorRemoteActorFactory.ExecutorRemot
|
||||
for (int i = 0; i < 120; i++) {
|
||||
var msg = persistence.getMessage(id);
|
||||
if (msg.state() == MqMessageState.ACK || msg.state() == MqMessageState.OK)
|
||||
return true;
|
||||
return id;
|
||||
if (msg.state() == MqMessageState.ERR || msg.state() == MqMessageState.DEAD)
|
||||
return false;
|
||||
return -id;
|
||||
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
}
|
||||
|
||||
return false; // Timeout
|
||||
return -1; // Timeout
|
||||
}
|
||||
|
||||
public String getState() {
|
||||
public String getState(long fromMsgId) {
|
||||
return persistence
|
||||
.getHeadMessage(inboxName)
|
||||
.getHeadMessage(inboxName, fromMsgId)
|
||||
.map(MqMessage::function)
|
||||
.orElse("INITIAL");
|
||||
}
|
||||
|
@ -7,7 +7,7 @@ package nu.marginalia.index.client.model.query;
|
||||
* */
|
||||
public enum SearchSetIdentifier {
|
||||
NONE,
|
||||
RETRO,
|
||||
POPULAR,
|
||||
BLOGS,
|
||||
ACADEMIA,
|
||||
SMALLWEB
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.client;
|
||||
|
||||
import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
||||
import io.reactivex.rxjava3.core.Scheduler;
|
||||
import io.reactivex.rxjava3.schedulers.Schedulers;
|
||||
import org.slf4j.Logger;
|
||||
@ -10,26 +9,16 @@ import javax.annotation.Nonnull;
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.ThreadFactory;
|
||||
|
||||
public class AbortingScheduler {
|
||||
private final ThreadFactory threadFactory;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Nullable
|
||||
private ExecutorService executorService;
|
||||
|
||||
public AbortingScheduler(String name) {
|
||||
threadFactory = new ThreadFactoryBuilder()
|
||||
.setNameFormat(name+"client--%d")
|
||||
.setUncaughtExceptionHandler(this::handleException)
|
||||
.build();
|
||||
public AbortingScheduler() {
|
||||
}
|
||||
|
||||
private void handleException(Thread thread, Throwable throwable) {
|
||||
logger.error("Uncaught exception during Client IO in thread {}", thread.getName(), throwable);
|
||||
}
|
||||
|
||||
public synchronized Scheduler get() {
|
||||
return Schedulers.from(getExecutorService(),
|
||||
@ -40,14 +29,14 @@ public class AbortingScheduler {
|
||||
public synchronized void abort() {
|
||||
if (null != executorService) {
|
||||
executorService.shutdownNow();
|
||||
executorService = Executors.newFixedThreadPool(16, threadFactory);
|
||||
executorService = Executors.newVirtualThreadPerTaskExecutor();
|
||||
}
|
||||
}
|
||||
|
||||
@Nonnull
|
||||
private synchronized ExecutorService getExecutorService() {
|
||||
if (null == executorService) {
|
||||
executorService = Executors.newFixedThreadPool(16, threadFactory);
|
||||
executorService = Executors.newVirtualThreadPerTaskExecutor();
|
||||
}
|
||||
return executorService;
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.client;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.reflect.TypeToken;
|
||||
import com.google.protobuf.GeneratedMessageV3;
|
||||
import io.reactivex.rxjava3.core.Observable;
|
||||
import io.reactivex.rxjava3.core.ObservableSource;
|
||||
@ -20,6 +21,7 @@ import org.slf4j.LoggerFactory;
|
||||
import spark.utils.IOUtils;
|
||||
|
||||
import java.io.OutputStream;
|
||||
import java.lang.reflect.Type;
|
||||
import java.net.ConnectException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Supplier;
|
||||
@ -233,6 +235,22 @@ public abstract class AbstractClient implements AutoCloseable {
|
||||
.doFinally(() -> ThreadContext.remove("outbound-request"));
|
||||
}
|
||||
|
||||
protected synchronized <T> Observable<T> get(Context ctx, int node, String endpoint, TypeToken<T> type) {
|
||||
ensureAlive(node);
|
||||
|
||||
var req = ctx.paint(new Request.Builder()).url(serviceRoutes.get(node) + endpoint).get().build();
|
||||
|
||||
return Observable.just(client.newCall(req))
|
||||
.subscribeOn(scheduler().get())
|
||||
.map(this::logInbound)
|
||||
.map(Call::execute)
|
||||
.map(this::logOutbound)
|
||||
.map(rsp -> validateResponseStatus(rsp, req, 200))
|
||||
.map(rsp -> getEntity(rsp, type))
|
||||
.retryWhen(this::retryHandler)
|
||||
.timeout(timeout, TimeUnit.SECONDS)
|
||||
.doFinally(() -> ThreadContext.remove("outbound-request"));
|
||||
}
|
||||
protected synchronized Observable<Integer> get(Context ctx, int node, String endpoint, OutputStream outputStream) {
|
||||
ensureAlive(node);
|
||||
|
||||
@ -388,6 +406,15 @@ public abstract class AbstractClient implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
@SneakyThrows
|
||||
private <T> T getEntity(Response response, TypeToken<T> clazz) {
|
||||
try (response) {
|
||||
return gson.fromJson(response.body().charStream(), clazz);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw ex;
|
||||
}
|
||||
}
|
||||
@SneakyThrows
|
||||
private String getText(Response response) {
|
||||
try (response) {
|
||||
return response.body().string();
|
||||
|
@ -1,8 +1,6 @@
|
||||
package nu.marginalia.client;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import nu.marginalia.client.route.RouteProvider;
|
||||
import nu.marginalia.client.route.ServiceRoute;
|
||||
import nu.marginalia.service.descriptor.ServiceDescriptor;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
@ -20,7 +18,7 @@ public class AbstractDynamicClient extends AbstractClient {
|
||||
);
|
||||
|
||||
this.service = service;
|
||||
this.scheduler = new AbortingScheduler(name());
|
||||
this.scheduler = new AbortingScheduler();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -42,7 +42,7 @@ public class AbstractClientTest {
|
||||
client = new AbstractClient(new RouteProvider(new ServiceDescriptor(ServiceId.Api, "localhost")), 1, Gson::new) {
|
||||
@Override
|
||||
public AbortingScheduler scheduler() {
|
||||
return new AbortingScheduler(name());
|
||||
return new AbortingScheduler();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -15,6 +15,7 @@ dependencies {
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:libraries:guarded-regex')
|
||||
implementation project(':code:libraries:geo-ip')
|
||||
|
||||
implementation libs.notnull
|
||||
|
||||
|
@ -1,72 +1,31 @@
|
||||
package nu.marginalia.ip_blocklist;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.opencsv.CSVReader;
|
||||
import com.opencsv.exceptions.CsvValidationException;
|
||||
import lombok.AllArgsConstructor;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.net.InetAddress;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
|
||||
@Singleton
|
||||
public class GeoIpBlocklist {
|
||||
private final TreeMap<Long, GeoIpBlocklist.IpRange> ranges = new TreeMap<>();
|
||||
/** These countries are extremely overrepresented among the problematic and spammy domains,
|
||||
* and blocking them is by far the most effective spam mitigation technique. Sucks we throw
|
||||
* babies out with the bathwater, but it's undeniably effective.
|
||||
*/
|
||||
private final Set<String> blacklist = Set.of("CN", "HK");
|
||||
private final Set<String> graylist = Set.of("RU", "TW", "IN", "ZA", "SG", "UA");
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(GeoIpBlocklist.class);
|
||||
|
||||
@AllArgsConstructor
|
||||
static class IpRange {
|
||||
public final long from;
|
||||
public final long to;
|
||||
public final String country;
|
||||
}
|
||||
private final GeoIpDictionary ipDictionary;
|
||||
|
||||
public GeoIpBlocklist() throws IOException, CsvValidationException {
|
||||
var resource = WmsaHome.getIPLocationDatabse();
|
||||
|
||||
try (var reader = new CSVReader(new FileReader(resource.toFile()))) {
|
||||
for (;;) {
|
||||
String[] vals = reader.readNext();
|
||||
if (vals == null) {
|
||||
break;
|
||||
}
|
||||
if (!(blacklist.contains(vals[2]) || graylist.contains(vals[2]))) {
|
||||
continue;
|
||||
}
|
||||
var range = new GeoIpBlocklist.IpRange(Long.parseLong(vals[0]),
|
||||
Long.parseLong(vals[1]),
|
||||
vals[2]);
|
||||
ranges.put(range.from, range);
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("Loaded {} IP ranges", ranges.size());
|
||||
}
|
||||
|
||||
public String getCountry(InetAddress address) {
|
||||
byte[] bytes = address.getAddress();
|
||||
long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF);
|
||||
|
||||
Long key = ranges.floorKey(ival);
|
||||
if (null == key) {
|
||||
return "-";
|
||||
}
|
||||
|
||||
var range = ranges.get(key);
|
||||
if (ival >= key && ival < range.to) {
|
||||
return range.country;
|
||||
}
|
||||
|
||||
return "-";
|
||||
@Inject
|
||||
public GeoIpBlocklist(GeoIpDictionary ipDictionary) {
|
||||
this.ipDictionary = ipDictionary;
|
||||
ipDictionary.waitReady();
|
||||
}
|
||||
|
||||
public boolean isAllowed(EdgeDomain domain) {
|
||||
@ -84,7 +43,7 @@ public class GeoIpBlocklist {
|
||||
|
||||
public String getCountry(EdgeDomain domain) {
|
||||
try {
|
||||
return getCountry(InetAddressCache.getAddress(domain));
|
||||
return ipDictionary.getCountry(InetAddressCache.getAddress(domain));
|
||||
}
|
||||
catch (Throwable ex) {
|
||||
logger.debug("Failed to resolve {}", domain);
|
||||
|
@ -11,7 +11,7 @@ import java.util.concurrent.TimeUnit;
|
||||
// We don't want to torture the DNS by resolving the same links over and over and over again
|
||||
|
||||
public class InetAddressCache {
|
||||
private static final Cache<EdgeDomain, InetAddress> cache = CacheBuilder.newBuilder().maximumSize(10_000_000).expireAfterAccess(1, TimeUnit.HOURS).build();
|
||||
private static final Cache<EdgeDomain, InetAddress> cache = CacheBuilder.newBuilder().maximumSize(1_000_000).expireAfterAccess(1, TimeUnit.HOURS).build();
|
||||
public static InetAddress getAddress(EdgeDomain domain) throws Throwable {
|
||||
try {
|
||||
return cache.get(domain, ()-> InetAddress.getByName(domain.getAddress()));
|
||||
|
@ -0,0 +1,139 @@
|
||||
package nu.marginalia.util;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
/**
|
||||
* Abstraction for exposing a (typically) read-from-disk -> parallel processing -> sequential output
|
||||
* workflow as an iterator, where the number of tasks is much larger than the number of cores
|
||||
*/
|
||||
public class ProcessingIterator<T> implements Iterator<T> {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ProcessingIterator.class);
|
||||
|
||||
private final LinkedBlockingQueue<T> queue;
|
||||
private final AtomicBoolean isFinished = new AtomicBoolean(false);
|
||||
private final ExecutorService executorService;
|
||||
private final Semaphore sem;
|
||||
|
||||
private T next = null;
|
||||
|
||||
private final int parallelism;
|
||||
|
||||
public ProcessingIterator(int queueSize, int parallelism, ProcessingJob<T> task) {
|
||||
this.parallelism = parallelism;
|
||||
|
||||
queue = new LinkedBlockingQueue<>(queueSize);
|
||||
executorService = Executors.newFixedThreadPool(parallelism);
|
||||
sem = new Semaphore(parallelism);
|
||||
|
||||
executorService.submit(() -> executeJob(task));
|
||||
}
|
||||
|
||||
private void executeJob(ProcessingJob<T> job) {
|
||||
try {
|
||||
job.run(this::executeTask);
|
||||
} catch (Exception e) {
|
||||
logger.warn("Exception while processing", e);
|
||||
} finally {
|
||||
isFinished.set(true);
|
||||
}
|
||||
}
|
||||
|
||||
private void executeTask(Task<T> task) {
|
||||
try {
|
||||
sem.acquire();
|
||||
} catch (InterruptedException e) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
queue.put(task.get());
|
||||
} catch (Exception e) {
|
||||
logger.warn("Exception while processing", e);
|
||||
} finally {
|
||||
sem.release();
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if there are more documents to be processed.
|
||||
* This method may block until we are certain this is true.
|
||||
* <p>
|
||||
* This method must be invoked from the same thread that invokes next(),
|
||||
* (or synchronize between the two)
|
||||
*/
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public boolean hasNext() {
|
||||
if (next != null)
|
||||
return true;
|
||||
|
||||
do {
|
||||
next = queue.poll(1, TimeUnit.SECONDS);
|
||||
if (next != null) {
|
||||
return true;
|
||||
}
|
||||
} while (expectMore());
|
||||
|
||||
if (!executorService.isShutdown()) {
|
||||
executorService.shutdown();
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Heuristic for whether we should expect more documents to be processed,
|
||||
* _trust but verify_ since we don't run this in an exclusive section
|
||||
* and may get a false positive. We never expect a false negative though.
|
||||
*/
|
||||
private boolean expectMore() {
|
||||
return !isFinished.get() // we are still reading from the database
|
||||
|| !queue.isEmpty() // ... or we have documents in the queue
|
||||
|| sem.availablePermits() < parallelism; // ... or we are still processing documents
|
||||
}
|
||||
|
||||
/** Returns the next document to be processed.
|
||||
* This method may block until we are certain there is a document to be processed.
|
||||
* <p>
|
||||
* This method must be invoked from the same thread that invokes hasNext(),
|
||||
* (or synchronize between the two)
|
||||
* <p>
|
||||
* If this is run after hasNext() returns false, a NoSuchElementException is thrown.
|
||||
*/
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public T next() {
|
||||
if (!hasNext()) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
|
||||
try {
|
||||
return next;
|
||||
}
|
||||
finally {
|
||||
next = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A job that produces a sequence of processing tasks that are to be
|
||||
* performed in parallel
|
||||
*/
|
||||
public interface ProcessingJob<T2> {
|
||||
void run(Consumer<Task<T2>> output) throws Exception;
|
||||
}
|
||||
|
||||
/**
|
||||
* A single task that produces a result to be iterable via the Iterator interface
|
||||
* (along with other tasks' outputs)
|
||||
*/
|
||||
public interface Task<T> {
|
||||
T get() throws Exception;
|
||||
}
|
||||
}
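A compact usage sketch of the class above; the queue size, parallelism and task body are arbitrary choices for illustration. The job enumerates work on one thread, each task runs on the internal pool, and results come back through the iterator in whatever order they finish.

import nu.marginalia.util.ProcessingIterator;

class ProcessingIteratorUsageSketch {
    public static void main(String[] args) {
        // Illustrative parameters: queue of 16, 4 worker threads
        var iter = new ProcessingIterator<String>(16, 4, tasks -> {
            for (int i = 0; i < 100; i++) {
                int id = i;
                tasks.accept(() -> "result-" + id); // each Task<T> runs on the worker pool
            }
        });
        while (iter.hasNext()) {
            System.out.println(iter.next()); // results arrive as they complete, not in submission order
        }
    }
}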
|
24
code/libraries/geo-ip/build.gradle
Normal file
@ -0,0 +1,24 @@
plugins {
    id 'java'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(21))
    }
}

dependencies {
    implementation project(':code:common:config')

    implementation libs.bundles.slf4j
    implementation libs.opencsv

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}

test {
    useJUnitPlatform()
}
6
code/libraries/geo-ip/readme.md
Normal file
@ -0,0 +1,6 @@
This micro-library handles GeoIP lookups, mapping IP addresses
to country codes.

It uses the free ip2location lite database, which is
available from [https://lite.ip2location.com/database/ip-country](https://lite.ip2location.com/database/ip-country)
under a CC-BY-SA 4.0 license.
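A short sketch of how the library's GeoIpDictionary (added below) can be used; the IP address is just an example value:

import nu.marginalia.geoip.GeoIpDictionary;

class GeoIpUsageSketch {
    public static void main(String[] args) {
        var dictionary = new GeoIpDictionary(); // starts loading the ip2location CSV in the background
        dictionary.waitReady();                 // block until the ranges have been loaded

        // Returns a country code, or "" if the address is unknown or the database failed to load
        System.out.println(dictionary.getCountry("127.0.0.1")); // example address
    }
}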
@ -0,0 +1,90 @@
|
||||
package nu.marginalia.geoip;
|
||||
|
||||
import com.opencsv.CSVReader;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.FileReader;
|
||||
import java.net.InetAddress;
|
||||
import java.util.TreeMap;
|
||||
|
||||
public class GeoIpDictionary {
|
||||
private volatile TreeMap<Long, IpRange> ranges = null;
|
||||
private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class);
|
||||
|
||||
record IpRange(long from, long to, String country) {}
|
||||
|
||||
public GeoIpDictionary() {
|
||||
Thread.ofPlatform().start(() -> {
|
||||
try (var reader = new CSVReader(new FileReader(WmsaHome.getIPLocationDatabse().toFile()))) {
|
||||
var dict = new TreeMap<Long, IpRange>();
|
||||
|
||||
for (;;) {
|
||||
String[] vals = reader.readNext();
|
||||
if (vals == null) {
|
||||
break;
|
||||
}
|
||||
var range = new IpRange(Long.parseLong(vals[0]),
|
||||
Long.parseLong(vals[1]),
|
||||
vals[2]);
|
||||
dict.put(range.from, range);
|
||||
}
|
||||
ranges = dict;
|
||||
logger.info("Loaded {} IP ranges", ranges.size());
|
||||
} catch (Exception e) {
|
||||
ranges = new TreeMap<>();
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
finally {
|
||||
this.notifyAll();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public boolean isReady() {
|
||||
return null != ranges;
|
||||
}
|
||||
|
||||
public boolean waitReady() {
|
||||
while (null == ranges) {
|
||||
try {
|
||||
synchronized (this) {
|
||||
this.wait(1000);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public String getCountry(String ip) {
|
||||
try {
|
||||
return getCountry(InetAddress.getByName(ip));
|
||||
} catch (Exception e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
public String getCountry(InetAddress address) {
|
||||
if (null == ranges) { // not loaded yet or failed to load
|
||||
return "";
|
||||
}
|
||||
|
||||
byte[] bytes = address.getAddress();
|
||||
long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF);
|
||||
|
||||
Long key = ranges.floorKey(ival);
|
||||
if (null == key) {
|
||||
return "";
|
||||
}
|
||||
|
||||
var range = ranges.get(key);
|
||||
if (ival >= key && ival < range.to) {
|
||||
return range.country;
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
}
|
@ -496,17 +496,21 @@ public class MqPersistence {
|
||||
return gson;
|
||||
}
|
||||
|
||||
/** Returns the last message sent to this inbox with a state of 'OK' */
|
||||
public Optional<MqMessage> getHeadMessage(String inboxName) {
|
||||
/** Returns the last message sent to this inbox with a state of 'OK'
|
||||
* with an id greater than or equal to fromMsgId
|
||||
*/
|
||||
public Optional<MqMessage> getHeadMessage(String inboxName, long fromMsgId) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var query = conn.prepareStatement("""
|
||||
SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE, SENDER_INBOX
|
||||
FROM MESSAGE_QUEUE
|
||||
WHERE RECIPIENT_INBOX = ? AND STATE='OK'
|
||||
WHERE RECIPIENT_INBOX = ? AND STATE='OK' AND ID >= ?
|
||||
ORDER BY ID DESC LIMIT 1
|
||||
"""))
|
||||
{
|
||||
query.setString(1, inboxName);
|
||||
query.setLong(2, fromMsgId);
|
||||
|
||||
var rs = query.executeQuery();
|
||||
if (rs.next()) {
|
||||
long msgId = rs.getLong(1);
|
||||
|
@ -4,6 +4,7 @@ import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.bigstring.BigString;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@ -35,4 +36,16 @@ public class CrawledDocument implements SerializableCrawlData {
|
||||
return SERIAL_IDENTIFIER;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getDomain() {
|
||||
if (url == null)
|
||||
return null;
|
||||
|
||||
return EdgeUrl
|
||||
.parse(url)
|
||||
.map(EdgeUrl::getDomain)
|
||||
.map(d -> d.domain)
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -2,4 +2,5 @@ package nu.marginalia.crawling.model;
|
||||
|
||||
public interface SerializableCrawlData {
|
||||
String getSerialIdentifier();
|
||||
String getDomain();
|
||||
}
|
||||
|
@ -3,6 +3,7 @@ package nu.marginalia.io.processed;
|
||||
import blue.strategic.parquet.HydratorSupplier;
|
||||
import blue.strategic.parquet.ParquetReader;
|
||||
import nu.marginalia.model.processed.DomainRecord;
|
||||
import nu.marginalia.model.processed.DomainWithIp;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
@ -19,10 +20,10 @@ public class DomainRecordParquetFileReader {
|
||||
}
|
||||
|
||||
@NotNull
|
||||
public static List<String> getDomainNames(Path path) throws IOException {
|
||||
public static List<DomainWithIp> getBasicDomainInformation(Path path) throws IOException {
|
||||
return ParquetReader.streamContent(path.toFile(),
|
||||
HydratorSupplier.constantly(DomainRecord.newDomainNameHydrator()),
|
||||
List.of("domain"))
|
||||
List.of("domain", "ip"))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
@ -8,7 +8,6 @@ import org.apache.parquet.schema.*;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
|
||||
import java.sql.Array;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@ -48,8 +47,8 @@ public class DomainRecord {
|
||||
return DomainRecord::dehydrate;
|
||||
}
|
||||
|
||||
public static Hydrator<String, String> newDomainNameHydrator() {
|
||||
return new DomainNameHydrator();
|
||||
public static Hydrator<DomainWithIp, DomainWithIp> newDomainNameHydrator() {
|
||||
return new DomainWithIpHydrator();
|
||||
}
|
||||
|
||||
|
||||
@ -124,23 +123,26 @@ class DomainHydrator implements Hydrator<DomainRecord, DomainRecord> {
|
||||
}
|
||||
}
|
||||
|
||||
class DomainNameHydrator implements Hydrator<String, String> {
|
||||
class DomainWithIpHydrator implements Hydrator<DomainWithIp, DomainWithIp> {
|
||||
|
||||
@Override
|
||||
public String start() {
|
||||
return "";
|
||||
public DomainWithIp start() {
|
||||
return new DomainWithIp();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String add(String target, String heading, Object value) {
|
||||
public DomainWithIp add(DomainWithIp target, String heading, Object value) {
|
||||
if ("domain".equals(heading)) {
|
||||
return (String) value;
|
||||
target.domain = (String) value;
|
||||
}
|
||||
else if ("ip".equals(heading)) {
|
||||
target.ip = (String) value;
|
||||
}
|
||||
return target;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String finish(String target) {
|
||||
public DomainWithIp finish(DomainWithIp target) {
|
||||
return target;
|
||||
}
|
||||
}
|
@ -0,0 +1,15 @@
package nu.marginalia.model.processed;

import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.ToString;

@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode
@ToString
public class DomainWithIp {
    public String domain;
    public String ip;
}
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.io.processed;
|
||||
|
||||
import nu.marginalia.model.processed.DomainRecord;
|
||||
import nu.marginalia.model.processed.DomainWithIp;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@ -53,8 +54,11 @@ class DomainRecordParquetFileReaderTest {
|
||||
writer.write(second);
|
||||
}
|
||||
|
||||
var domainNames = DomainRecordParquetFileReader.getDomainNames(parquetFile);
|
||||
assertEquals(List.of("www.marginalia.nu", "memex.marginalia.nu"), domainNames);
|
||||
var domainInfo = DomainRecordParquetFileReader.getBasicDomainInformation(parquetFile);
|
||||
assertEquals(List.of(
|
||||
new DomainWithIp("www.marginalia.nu", "127.0.0.1"),
|
||||
new DomainWithIp("memex.marginalia.nu", "127.0.0.1")),
|
||||
domainInfo);
|
||||
|
||||
var items = DomainRecordParquetFileReader
|
||||
.stream(parquetFile)
|
||||
|
@ -41,6 +41,7 @@ dependencies {
|
||||
|
||||
implementation project(':code:libraries:guarded-regex')
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
implementation project(':code:libraries:geo-ip')
|
||||
implementation project(':code:libraries:big-string')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
|
||||
|
@ -138,10 +138,16 @@ public class ConverterMain {
|
||||
for (var domain : plan.crawlDataIterable(id -> !batchingWorkLog.isItemProcessed(id)))
|
||||
{
|
||||
pool.submit(() -> {
|
||||
ProcessedDomain processed = processor.process(domain);
|
||||
converterWriter.accept(processed);
|
||||
|
||||
heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
|
||||
try {
|
||||
ProcessedDomain processed = processor.process(domain);
|
||||
converterWriter.accept(processed);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.info("Error in processing", ex);
|
||||
}
|
||||
finally {
|
||||
heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -10,6 +10,7 @@ import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.processor.logic.links.LinkGraph;
|
||||
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.crawling.model.*;
|
||||
import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@ -22,6 +23,7 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class DomainProcessor {
|
||||
private final DocumentProcessor documentProcessor;
|
||||
@ -29,6 +31,7 @@ public class DomainProcessor {
|
||||
private final AnchorTagsSource anchorTagsSource;
|
||||
private final AnchorTextKeywords anchorTextKeywords;
|
||||
private final LshDocumentDeduplicator documentDeduplicator;
|
||||
private final GeoIpDictionary geoIpDictionary;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@ -37,13 +40,16 @@ public class DomainProcessor {
|
||||
SiteWords siteWords,
|
||||
AnchorTagsSourceFactory anchorTagsSourceFactory,
|
||||
AnchorTextKeywords anchorTextKeywords,
|
||||
LshDocumentDeduplicator documentDeduplicator) throws SQLException
|
||||
LshDocumentDeduplicator documentDeduplicator, GeoIpDictionary geoIpDictionary) throws SQLException
|
||||
{
|
||||
this.documentProcessor = documentProcessor;
|
||||
this.siteWords = siteWords;
|
||||
this.anchorTextKeywords = anchorTextKeywords;
|
||||
this.documentDeduplicator = documentDeduplicator;
|
||||
this.anchorTagsSource = anchorTagsSourceFactory.create();
|
||||
this.geoIpDictionary = geoIpDictionary;
|
||||
|
||||
geoIpDictionary.waitReady();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@ -54,11 +60,21 @@ public class DomainProcessor {
|
||||
boolean cookies = false;
|
||||
String ip = "";
|
||||
|
||||
DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain);
|
||||
DomainLinks externalDomainLinks = null;
|
||||
|
||||
while (dataStream.hasNext()) {
|
||||
var data = dataStream.next();
|
||||
|
||||
// Do a lazy load of the external domain links since we don't know the domain
|
||||
// until we see the first document
|
||||
if (externalDomainLinks == null) {
|
||||
var domain = data.getDomain();
|
||||
|
||||
if (domain != null) {
|
||||
externalDomainLinks = anchorTagsSource.getAnchorTags(domain);
|
||||
}
|
||||
}
|
||||
|
||||
if (data instanceof CrawledDomain crawledDomain) {
|
||||
ret.domain = new EdgeDomain(crawledDomain.domain);
|
||||
ret.ip = crawledDomain.ip;
|
||||
@ -76,8 +92,15 @@ public class DomainProcessor {
|
||||
try {
|
||||
if (doc.url == null)
|
||||
continue;
|
||||
|
||||
fixBadCanonicalTag(doc);
|
||||
|
||||
// This case should never be reachable, as we should have initiated
|
||||
// the externalDomainLinks variable above if we made it past the
|
||||
// doc.url == null check; but we'll leave it here just in case
|
||||
// to make debugging easier if we break this.
|
||||
assert externalDomainLinks != null : "externalDomainLinks has not been initialized";
|
||||
|
||||
docs.add(documentProcessor.process(doc, externalDomainLinks));
|
||||
}
|
||||
catch (Exception ex) {
|
||||
@ -89,11 +112,22 @@ public class DomainProcessor {
|
||||
// Add late keywords and features from domain-level information
|
||||
|
||||
List<String> terms = new ArrayList<>();
|
||||
|
||||
terms.add("ip:"+ip);
|
||||
|
||||
String ipCountryCode = geoIpDictionary.getCountry(ip).toLowerCase();
|
||||
if (!ipCountryCode.isBlank()) {
|
||||
terms.add("ip:"+ipCountryCode);
|
||||
}
|
||||
|
||||
if (cookies) {
|
||||
terms.add(HtmlFeature.COOKIES.getKeyword());
|
||||
}
|
||||
|
||||
if (isAcademicDomain(ret.domain)) {
|
||||
terms.add("special:academia");
|
||||
}
|
||||
|
||||
for (var document : ret.documents) {
|
||||
if (document.details == null)
|
||||
continue;
|
||||
@ -114,6 +148,19 @@ public class DomainProcessor {
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
|
||||
private boolean isAcademicDomain(EdgeDomain domain) {
|
||||
|
||||
if (domain.domain.endsWith(".edu"))
|
||||
return true;
|
||||
|
||||
if (academicPattern.matcher(domain.domain).matches())
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private void fixBadCanonicalTag(CrawledDocument doc) {
|
||||
// Some sites have a canonical tag that points to a different domain,
|
||||
// but our loader can not support this, so we point these back to the
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.atags.AnchorTextKeywords;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory;
|
||||
import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
|
||||
@ -21,6 +22,7 @@ public class SideloadSourceFactory {
|
||||
private final SideloaderProcessing sideloaderProcessing;
|
||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||
private final DocumentKeywordExtractor documentKeywordExtractor;
|
||||
private final AnchorTextKeywords anchorTextKeywords;
|
||||
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
|
||||
private final DirtreeSideloaderFactory dirtreeSideloaderFactory;
|
||||
private final WarcSideloadFactory warcSideloadFactory;
|
||||
@ -29,7 +31,7 @@ public class SideloadSourceFactory {
|
||||
public SideloadSourceFactory(Gson gson,
|
||||
SideloaderProcessing sideloaderProcessing,
|
||||
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
|
||||
DocumentKeywordExtractor documentKeywordExtractor,
|
||||
DocumentKeywordExtractor documentKeywordExtractor, AnchorTextKeywords anchorTextKeywords,
|
||||
AnchorTagsSourceFactory anchorTagsSourceFactory,
|
||||
DirtreeSideloaderFactory dirtreeSideloaderFactory,
|
||||
WarcSideloadFactory warcSideloadFactory) {
|
||||
@ -37,13 +39,14 @@ public class SideloadSourceFactory {
|
||||
this.sideloaderProcessing = sideloaderProcessing;
|
||||
this.sentenceExtractorProvider = sentenceExtractorProvider;
|
||||
this.documentKeywordExtractor = documentKeywordExtractor;
|
||||
this.anchorTextKeywords = anchorTextKeywords;
|
||||
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
|
||||
this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
|
||||
this.warcSideloadFactory = warcSideloadFactory;
|
||||
}
|
||||
|
||||
public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException {
|
||||
return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, sideloaderProcessing);
|
||||
return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing);
|
||||
}
|
||||
|
||||
public Collection<? extends SideloadSource> sideloadDirtree(Path pathToYamlFile) throws IOException {
|
||||
|
@ -3,6 +3,7 @@ package nu.marginalia.converting.sideload.encyclopedia;
|
||||
import com.github.luben.zstd.ZstdInputStream;
|
||||
import com.google.gson.Gson;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.atags.AnchorTextKeywords;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.converting.model.DisqualifiedException;
|
||||
@ -14,6 +15,7 @@ import nu.marginalia.converting.sideload.SideloaderProcessing;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.util.ProcessingIterator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -27,15 +29,10 @@ import java.nio.file.Path;
|
||||
import java.sql.*;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
/** This is an experimental sideloader for encyclopedia.marginalia.nu's database;
|
||||
* (which serves as a way of loading wikipedia's zim files without binding to GPL2'd code)
|
||||
*
|
||||
* <p>
|
||||
* See https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu for extracting the data
|
||||
*/
|
||||
public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
|
||||
@ -43,6 +40,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
private final Connection connection;
|
||||
private final EdgeUrl baseUrl;
|
||||
private final Gson gson;
|
||||
private final AnchorTextKeywords anchorTextKeywords;
|
||||
private final SideloaderProcessing sideloaderProcessing;
|
||||
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
|
||||
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaMarginaliaNuSideloader.class);
|
||||
@ -51,9 +49,11 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
String baseUrl,
|
||||
Gson gson,
|
||||
AnchorTagsSourceFactory anchorTagsSourceFactory,
|
||||
AnchorTextKeywords anchorTextKeywords,
|
||||
SideloaderProcessing sideloaderProcessing) throws SQLException {
|
||||
this.baseUrl = EdgeUrl.parse(baseUrl).orElseThrow(AssertionError::new);
|
||||
this.gson = gson;
|
||||
this.anchorTextKeywords = anchorTextKeywords;
|
||||
this.sideloaderProcessing = sideloaderProcessing;
|
||||
String sqliteDbString = "jdbc:sqlite:" + pathToDbFile.toString();
|
||||
|
||||
@ -76,62 +76,24 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Iterator<ProcessedDocument> getDocumentsStream() {
|
||||
LinkedBlockingQueue<ProcessedDocument> docs = new LinkedBlockingQueue<>(32);
|
||||
AtomicBoolean isFinished = new AtomicBoolean(false);
|
||||
return new ProcessingIterator<>(24, 16, (taskConsumer) -> {
|
||||
DomainLinks domainLinks = getDomainLinks();
|
||||
|
||||
ExecutorService executorService = Executors.newFixedThreadPool(16);
|
||||
Semaphore sem = new Semaphore(16);
|
||||
var stmt = connection.prepareStatement("""
|
||||
SELECT url,title,html FROM articles
|
||||
""");
|
||||
stmt.setFetchSize(100);
|
||||
|
||||
DomainLinks domainLinks = getDomainLinks();
|
||||
var rs = stmt.executeQuery();
|
||||
|
||||
executorService.submit(() -> {
|
||||
try {
|
||||
var stmt = connection.prepareStatement("""
|
||||
SELECT url,title,html FROM articles
|
||||
""");
|
||||
stmt.setFetchSize(100);
|
||||
while (rs.next()) {
|
||||
var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
|
||||
String title = rs.getString("title");
|
||||
String url = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8);
|
||||
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
|
||||
String title = rs.getString("title");
|
||||
String url = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8);
|
||||
|
||||
sem.acquire();
|
||||
|
||||
executorService.submit(() -> {
|
||||
try {
|
||||
docs.add(convertDocument(articleParts.parts, title, url, domainLinks));
|
||||
} catch (URISyntaxException | DisqualifiedException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
sem.release();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
stmt.close();
|
||||
}
|
||||
catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
finally {
|
||||
isFinished.set(true);
|
||||
taskConsumer.accept(() -> convertDocument(articleParts.parts, title, url, domainLinks));
|
||||
}
|
||||
});
|
||||
|
||||
return new Iterator<>() {
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return !isFinished.get() || !docs.isEmpty() || sem.availablePermits() < 16;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public ProcessedDocument next() {
|
||||
return docs.take();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private DomainLinks getDomainLinks() {
|
||||
@ -142,30 +104,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
logger.error("Failed to create anchor tags source", ex);
|
||||
return new DomainLinks();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
ProcessedDocument processJust(String url) throws SQLException, IOException, URISyntaxException, DisqualifiedException {
|
||||
var stmt = connection.prepareStatement("""
|
||||
SELECT url,title,html
|
||||
FROM articles
|
||||
WHERE url=?
|
||||
""");
|
||||
stmt.setFetchSize(100);
|
||||
stmt.setString(1, url);
|
||||
|
||||
var rs = stmt.executeQuery();
|
||||
if (rs.next()) {
|
||||
var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
|
||||
String title = rs.getString("title");
|
||||
|
||||
return convertDocument(articleParts.parts,
|
||||
title,
|
||||
URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8),
|
||||
new DomainLinks() // FIXME (2023-11-06): Sideloaded dirtrees don't have access to anchor tag data.
|
||||
);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private ProcessedDocument convertDocument(List<String> parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException {
|
||||
@ -180,13 +118,22 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
}
|
||||
fullHtml.append("</body></html>");
|
||||
|
||||
return sideloaderProcessing
|
||||
var doc = sideloaderProcessing
|
||||
.processDocument(fullUrl,
|
||||
fullHtml.toString(),
|
||||
List.of("encyclopedia", "wiki"),
|
||||
domainLinks,
|
||||
GeneratorType.WIKI,
|
||||
10_000_000);
|
||||
|
||||
// Add anchor text keywords
|
||||
if (doc.isProcessedFully()) {
|
||||
doc.words.addAnchorTerms(
|
||||
anchorTextKeywords.getAnchorTextKeywords(domainLinks, doc.url)
|
||||
);
|
||||
}
|
||||
|
||||
return doc;
|
||||
}
|
||||
|
||||
private <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException {
|
||||
|
@ -6,6 +6,7 @@ import lombok.SneakyThrows;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
@ -78,7 +79,7 @@ public class CrawlingThenConvertingIntegrationTest {
|
||||
List<SerializableCrawlData> data = new ArrayList<>();
|
||||
|
||||
try (var recorder = new WarcRecorder()) {
|
||||
new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch();
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch();
|
||||
}
|
||||
|
||||
CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get();
|
||||
|
@ -1,38 +1,12 @@
|
||||
package nu.marginalia.converting.sideload.encyclopedia;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
import com.google.inject.Guice;
|
||||
import gnu.trove.list.array.TLongArrayList;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.converting.ConverterModule;
|
||||
import nu.marginalia.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.converting.processor.ConverterDomainTypes;
|
||||
import nu.marginalia.converting.sideload.SideloaderProcessing;
|
||||
import nu.marginalia.io.processed.DocumentRecordParquetFileReader;
|
||||
import nu.marginalia.io.processed.DocumentRecordParquetFileWriter;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.model.processed.DocumentRecord;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
class EncyclopediaMarginaliaNuSideloaderTest {
|
||||
Path tempFile;
|
||||
@ -62,93 +36,5 @@ class EncyclopediaMarginaliaNuSideloaderTest {
|
||||
System.out.printf("%64s\n", Long.toBinaryString(Long.reverseBytes(0x1000000000000000L)));
|
||||
System.out.printf("%64s\n", Long.toBinaryString(0x10L));
|
||||
}
|
||||
@Test
|
||||
public void debugSpecificArticle() throws SQLException, IOException, URISyntaxException, DisqualifiedException {
|
||||
Path pathToDbFile = Path.of("/home/vlofgren/Code/MarginaliaSearch/run/samples/articles.db");
|
||||
if (!Files.exists(pathToDbFile)) {
|
||||
// not really practical to ship a 40 GB sqlite file on github
|
||||
// you need to be @vlofgren to run this test
|
||||
return;
|
||||
}
|
||||
var domainTypesMock = Mockito.mock(ConverterDomainTypes.class);
|
||||
when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false);
|
||||
var processing = Guice.createInjector(new ConverterModule(),
|
||||
new AbstractModule() {
|
||||
public void configure() {
|
||||
bind(ConverterDomainTypes.class).toInstance(domainTypesMock);
|
||||
}
|
||||
}
|
||||
)
|
||||
.getInstance(SideloaderProcessing.class);
|
||||
|
||||
var atagsFactory = Mockito.mock(AnchorTagsSourceFactory.class);
|
||||
when(atagsFactory.create(Mockito.any())).thenReturn(domain -> new DomainLinks());
|
||||
|
||||
var sideloader = new EncyclopediaMarginaliaNuSideloader(
|
||||
pathToDbFile,
|
||||
"https://en.wikipedia.org/wiki/",
|
||||
GsonFactory.get(),
|
||||
atagsFactory,
|
||||
processing
|
||||
);
|
||||
|
||||
var document = sideloader.processJust("Don't_Tell_Me_(Madonna_song)");
|
||||
|
||||
System.out.println(document);
|
||||
|
||||
var keywordsBuilt = document.words.build();
|
||||
|
||||
var ptr = keywordsBuilt.newPointer();
|
||||
|
||||
Map<String, WordMetadata> dirtyAndBlues = new HashMap<>();
|
||||
|
||||
while (ptr.advancePointer()) {
|
||||
String word = ptr.getKeyword();
|
||||
|
||||
System.out.println(word + ": " + Long.toHexString(Long.reverseBytes(ptr.getMetadata())));
|
||||
|
||||
if (Set.of("dirty", "blues").contains(word)) {
|
||||
WordMetadata meta = new WordMetadata(ptr.getMetadata());
|
||||
|
||||
Assertions.assertNull(
|
||||
dirtyAndBlues.put(word, meta)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Assertions.assertTrue(dirtyAndBlues.containsKey("dirty"));
|
||||
Assertions.assertTrue(dirtyAndBlues.containsKey("blues"));
|
||||
Assertions.assertNotEquals(
|
||||
dirtyAndBlues.get("dirty"),
|
||||
dirtyAndBlues.get("blues")
|
||||
);
|
||||
|
||||
try (var dw = new DocumentRecordParquetFileWriter(tempFile)) {
|
||||
dw.write(new DocumentRecord(
|
||||
"encyclopedia.marginalia.nu",
|
||||
document.url.toString(),
|
||||
0,
|
||||
document.state.toString(),
|
||||
document.stateReason,
|
||||
document.details.title,
|
||||
document.details.description,
|
||||
HtmlFeature.encode(document.details.features),
|
||||
document.details.standard.name(),
|
||||
document.details.length,
|
||||
document.details.hashCode,
|
||||
(float) document.details.quality,
|
||||
document.details.metadata.encode(),
|
||||
document.details.pubYear,
|
||||
List.of(keywordsBuilt.keywords),
|
||||
new TLongArrayList(keywordsBuilt.metadata)
|
||||
));
|
||||
}
|
||||
|
||||
var record = DocumentRecordParquetFileReader.streamKeywordsProjection(tempFile).findFirst().get();
|
||||
String[] words = record.words.toArray(String[]::new);
|
||||
long[] meta = record.metas.toArray();
|
||||
|
||||
assertArrayEquals(keywordsBuilt.keywords, words);
|
||||
assertArrayEquals(keywordsBuilt.metadata, meta);
|
||||
}
|
||||
}
|
@ -0,0 +1,38 @@
package nu.marginalia.util;

import org.junit.jupiter.api.Test;

import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

class ProcessingIteratorTest {

    @Test
    public void test() {
        Set<Integer> output = new HashSet<>();
        var iter = new ProcessingIterator<Integer>(2, 2, q -> {
            for (int i = 0; i < 10_000; i++) {
                int j = i;
                q.accept(() -> task(j));
            }
        });
        while (iter.hasNext()) {
            output.add(iter.next());
        }

        assertEquals(10_000, output.size());

        for (int i = 0; i < 10_000; i++) {
            assertTrue(output.contains(i));
        }
    }

    int task(int n) throws InterruptedException {
        TimeUnit.NANOSECONDS.sleep(10);
        return n;
    }
}
@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.atags.source.AnchorTagsSource;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.crawl.retreival.CrawlDataReference;
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.spec.CrawlSpecProvider;
|
||||
@ -51,6 +52,7 @@ public class CrawlerMain {
|
||||
|
||||
private final ProcessHeartbeatImpl heartbeat;
|
||||
private final MessageQueueFactory messageQueueFactory;
|
||||
private final DomainProber domainProber;
|
||||
private final FileStorageService fileStorageService;
|
||||
private final DbCrawlSpecProvider dbCrawlSpecProvider;
|
||||
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
|
||||
@ -70,7 +72,7 @@ public class CrawlerMain {
|
||||
@Inject
|
||||
public CrawlerMain(UserAgent userAgent,
|
||||
ProcessHeartbeatImpl heartbeat,
|
||||
MessageQueueFactory messageQueueFactory,
|
||||
MessageQueueFactory messageQueueFactory, DomainProber domainProber,
|
||||
FileStorageService fileStorageService,
|
||||
ProcessConfiguration processConfiguration,
|
||||
DbCrawlSpecProvider dbCrawlSpecProvider,
|
||||
@ -78,6 +80,7 @@ public class CrawlerMain {
|
||||
Gson gson) {
|
||||
this.heartbeat = heartbeat;
|
||||
this.messageQueueFactory = messageQueueFactory;
|
||||
this.domainProber = domainProber;
|
||||
this.fileStorageService = fileStorageService;
|
||||
this.dbCrawlSpecProvider = dbCrawlSpecProvider;
|
||||
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
|
||||
@ -211,14 +214,13 @@ public class CrawlerMain {
|
||||
|
||||
try (CrawledDomainWriter writer = new CrawledDomainWriter(outputDir, domain, id);
|
||||
var warcRecorder = new WarcRecorder(); // write to a temp file for now
|
||||
var retreiver = new CrawlerRetreiver(fetcher, specification, warcRecorder, writer::accept);
|
||||
var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder, writer::accept);
|
||||
CrawlDataReference reference = getReference())
|
||||
{
|
||||
Thread.currentThread().setName("crawling:" + domain);
|
||||
|
||||
var domainLinks = anchorTagsSource.getAnchorTags(domain);
|
||||
|
||||
|
||||
int size = retreiver.fetch(domainLinks, reference);
|
||||
|
||||
workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size);
|
||||
|
@ -45,7 +45,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
private static final UrlBlocklist urlBlocklist = new UrlBlocklist();
|
||||
private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
|
||||
|
||||
private static final DomainProber domainProber = new DomainProber();
|
||||
private final DomainProber domainProber;
|
||||
private final SitemapRetriever sitemapRetriever;
|
||||
private final DomainCrawlFrontier crawlFrontier;
|
||||
private final WarcRecorder warcRecorder;
|
||||
@ -59,12 +59,14 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
private static final String documentWasSameTag = "SAME-BY-COMPARISON";
|
||||
|
||||
public CrawlerRetreiver(HttpFetcher fetcher,
|
||||
DomainProber domainProber,
|
||||
CrawlSpecRecord specs,
|
||||
WarcRecorder warcRecorder,
|
||||
Consumer<SerializableCrawlData> writer)
|
||||
{
|
||||
this.warcRecorder = warcRecorder;
|
||||
this.fetcher = fetcher;
|
||||
this.domainProber = domainProber;
|
||||
|
||||
domain = specs.domain;
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawling.model.CrawlerDomainStatus;
|
||||
@ -11,17 +13,21 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
@Singleton
|
||||
public class DomainProber {
|
||||
private final Logger logger = LoggerFactory.getLogger(DomainProber.class);
|
||||
private static IpBlockList ipBlockList;
|
||||
private final Predicate<EdgeDomain> domainBlacklist;
|
||||
|
||||
static {
|
||||
try {
|
||||
ipBlockList = new IpBlockList(new GeoIpBlocklist());
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
@Inject
|
||||
public DomainProber(IpBlockList ipBlockList) {
|
||||
this.domainBlacklist = ipBlockList::isAllowed;
|
||||
}
|
||||
|
||||
/** For testing */
|
||||
public DomainProber(Predicate<EdgeDomain> domainBlacklist) {
|
||||
this.domainBlacklist = domainBlacklist;
|
||||
}
|
||||
|
||||
/** To detect problems early we do a probing request to the domain before we start crawling it properly.
|
||||
@ -37,7 +43,7 @@ public class DomainProber {
|
||||
return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
|
||||
}
|
||||
|
||||
if (!ipBlockList.isAllowed(firstUrlInQueue.domain))
|
||||
if (!domainBlacklist.test(firstUrlInQueue.domain))
|
||||
return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");
|
||||
|
||||
var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
|
||||
@ -62,7 +68,7 @@ public class DomainProber {
|
||||
/** This domain redirects to another domain */
|
||||
public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
|
||||
|
||||
/** If the retreivala of the probed url was successful, return the url as it was fetched
|
||||
/** If the retrieval of the probed url was successful, return the url as it was fetched
|
||||
* (which may be different from the url we probed, if we attempted another URL schema).
|
||||
*
|
||||
* @param probedUrl The url we successfully probed
|
||||
|
@ -90,7 +90,10 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
}
|
||||
|
||||
@Inject
|
||||
public HttpFetcherImpl(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) {
|
||||
public HttpFetcherImpl(@Named("user-agent") String userAgent,
|
||||
Dispatcher dispatcher,
|
||||
ConnectionPool connectionPool)
|
||||
{
|
||||
this.client = createClient(dispatcher, connectionPool);
|
||||
this.userAgent = userAgent;
|
||||
this.contentTypeProber = new ContentTypeProber(userAgent, client);
|
||||
|
@ -3,6 +3,7 @@ package nu.marginalia.crawling.retreival;
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.retreival.fetcher.*;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
@ -65,7 +66,7 @@ public class CrawlerMockFetcherTest {
|
||||
|
||||
void crawl(CrawlSpecRecord spec, Consumer<SerializableCrawlData> consumer) throws IOException {
|
||||
try (var recorder = new WarcRecorder()) {
|
||||
new CrawlerRetreiver(fetcherMock, spec, recorder, consumer)
|
||||
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder, consumer)
|
||||
.fetch();
|
||||
}
|
||||
}
|
||||
|
@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.crawl.retreival.CrawlDataReference;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
@ -59,7 +60,7 @@ class CrawlerRetreiverTest {
|
||||
List<SerializableCrawlData> data = new ArrayList<>();
|
||||
|
||||
try (var recorder = new WarcRecorder(tempFile)) {
|
||||
new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch();
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch();
|
||||
} catch (IOException ex) {
|
||||
Assertions.fail(ex);
|
||||
}
|
||||
@ -103,7 +104,7 @@ class CrawlerRetreiverTest {
|
||||
List<SerializableCrawlData> data = new ArrayList<>();
|
||||
|
||||
try (var recorder = new WarcRecorder()) {
|
||||
new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch();
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch();
|
||||
}
|
||||
catch (IOException ex) {
|
||||
Assertions.fail(ex);
|
||||
@ -137,7 +138,7 @@ class CrawlerRetreiverTest {
|
||||
List<SerializableCrawlData> data = new ArrayList<>();
|
||||
|
||||
try (var recorder = new WarcRecorder()) {
|
||||
new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch();
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch();
|
||||
}
|
||||
catch (IOException ex) {
|
||||
Assertions.fail(ex);
|
||||
@ -178,7 +179,7 @@ class CrawlerRetreiverTest {
|
||||
Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
|
||||
|
||||
try (var recorder = new WarcRecorder()) {
|
||||
new CrawlerRetreiver(httpFetcher, specs, recorder, d -> {
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, d -> {
|
||||
data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d);
|
||||
if (d instanceof CrawledDocument doc) {
|
||||
System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
|
||||
@ -202,7 +203,7 @@ class CrawlerRetreiverTest {
|
||||
CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
|
||||
domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
|
||||
try (var recorder = new WarcRecorder()) {
|
||||
new CrawlerRetreiver(httpFetcher, specs, recorder, d -> {
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, d -> {
|
||||
if (d instanceof CrawledDocument doc) {
|
||||
System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
|
||||
}
|
||||
|
@ -9,6 +9,7 @@ import nu.marginalia.io.processed.DomainRecordParquetFileReader;
|
||||
import nu.marginalia.loading.LoaderInputData;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.processed.DomainRecord;
|
||||
import nu.marginalia.model.processed.DomainWithIp;
|
||||
import nu.marginalia.process.control.ProcessHeartbeatImpl;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -51,9 +52,9 @@ public class DomainLoaderService {
|
||||
) {
|
||||
|
||||
try (var inserter = new DomainInserter(conn, nodeId)) {
|
||||
for (var domain : readSetDomainNames(inputData)) {
|
||||
inserter.accept(new EdgeDomain(domain));
|
||||
domainNamesAll.add(domain);
|
||||
for (var domainWithIp : readBasicDomainInformation(inputData)) {
|
||||
inserter.accept(new EdgeDomain(domainWithIp.domain));
|
||||
domainNamesAll.add(domainWithIp.domain);
|
||||
}
|
||||
}
|
||||
try (var inserter = new DomainInserter(conn, -1)) {
|
||||
@ -63,9 +64,9 @@ public class DomainLoaderService {
|
||||
}
|
||||
}
|
||||
|
||||
try (var updater = new DomainAffinityUpdater(conn, nodeId)) {
|
||||
for (var domain : readSetDomainNames(inputData)) {
|
||||
updater.accept(new EdgeDomain(domain));
|
||||
try (var updater = new DomainAffinityAndIpUpdater(conn, nodeId)) {
|
||||
for (var domainWithIp : readBasicDomainInformation(inputData)) {
|
||||
updater.accept(new EdgeDomain(domainWithIp.domain), domainWithIp.ip);
|
||||
}
|
||||
}
|
||||
|
||||
@ -84,15 +85,15 @@ public class DomainLoaderService {
|
||||
return ret;
|
||||
}
|
||||
|
||||
Collection<String> readSetDomainNames(LoaderInputData inputData) throws IOException {
|
||||
final Set<String> domainNamesAll = new HashSet<>(100_000);
|
||||
Collection<DomainWithIp> readBasicDomainInformation(LoaderInputData inputData) throws IOException {
|
||||
final Set<DomainWithIp> domainsAll = new HashSet<>(100_000);
|
||||
|
||||
var domainFiles = inputData.listDomainFiles();
|
||||
for (var file : domainFiles) {
|
||||
domainNamesAll.addAll(DomainRecordParquetFileReader.getDomainNames(file));
|
||||
domainsAll.addAll(DomainRecordParquetFileReader.getBasicDomainInformation(file));
|
||||
}
|
||||
|
||||
return domainNamesAll;
|
||||
return domainsAll;
|
||||
}
|
||||
|
||||
Collection<String> readReferencedDomainNames(LoaderInputData inputData) throws IOException {
|
||||
@ -164,20 +165,25 @@ public class DomainLoaderService {
|
||||
statement.close();
|
||||
}
|
||||
}
|
||||
private static class DomainAffinityUpdater implements AutoCloseable {
|
||||
private static class DomainAffinityAndIpUpdater implements AutoCloseable {
|
||||
private final PreparedStatement statement;
|
||||
private final int nodeAffinity;
|
||||
|
||||
private int count = 0;
|
||||
|
||||
public DomainAffinityUpdater(Connection connection, int affinity) throws SQLException {
|
||||
public DomainAffinityAndIpUpdater(Connection connection, int affinity) throws SQLException {
|
||||
this.nodeAffinity = affinity;
|
||||
statement = connection.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY = ? WHERE DOMAIN_NAME=?");
|
||||
statement = connection.prepareStatement("""
|
||||
UPDATE EC_DOMAIN
|
||||
SET NODE_AFFINITY = ?, IP = ?
|
||||
WHERE DOMAIN_NAME=?
|
||||
""");
|
||||
}
|
||||
|
||||
public void accept(EdgeDomain domain) throws SQLException {
|
||||
public void accept(EdgeDomain domain, String ip) throws SQLException {
|
||||
statement.setInt(1, nodeAffinity);
|
||||
statement.setString(2, domain.toString());
|
||||
statement.setString(2, ip);
|
||||
statement.setString(3, domain.toString());
|
||||
statement.addBatch();
|
||||
|
||||
if (++count > 1000) {
|
||||
|
@ -6,7 +6,6 @@ import nu.marginalia.ProcessConfiguration;
|
||||
import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter;
|
||||
import nu.marginalia.io.processed.DomainRecordParquetFileWriter;
|
||||
import nu.marginalia.io.processed.ProcessedDataFileNames;
|
||||
import nu.marginalia.loader.DbTestUtil;
|
||||
import nu.marginalia.loading.LoaderInputData;
|
||||
import nu.marginalia.model.processed.DomainLinkRecord;
|
||||
import nu.marginalia.model.processed.DomainRecord;
|
||||
@ -21,10 +20,8 @@ import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
@ -99,7 +96,7 @@ class DomainLoaderServiceTest {
|
||||
|
||||
// Verify
|
||||
Set<String> expectedDomains1 = Sets.union(new HashSet<>(domains1), new HashSet<>(domains2));
|
||||
assertEquals(expectedDomains1, domainService.readSetDomainNames(new LoaderInputData(workDir, 2)));
|
||||
assertEquals(expectedDomains1, domainService.readBasicDomainInformation(new LoaderInputData(workDir, 2)).stream().map(d -> d.domain).collect(Collectors.toSet()));
|
||||
|
||||
Set<String> expectedDomains2 = new HashSet<>(linkDomains);
|
||||
assertEquals(expectedDomains2, domainService.readReferencedDomainNames(new LoaderInputData(workDir, 2)));
|
||||
|
@ -10,8 +10,6 @@ import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.index.searchset.SearchSet;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.query.client.QueryClient;
|
||||
import nu.marginalia.query.model.QueryParams;
|
||||
@ -64,7 +62,7 @@ public class ApiSearchOperator {
|
||||
return switch (index) {
|
||||
case 0 -> SearchSetIdentifier.NONE;
|
||||
case 1 -> SearchSetIdentifier.SMALLWEB;
|
||||
case 2 -> SearchSetIdentifier.RETRO;
|
||||
case 2 -> SearchSetIdentifier.POPULAR;
|
||||
case 3 -> SearchSetIdentifier.NONE;
|
||||
case 5 -> SearchSetIdentifier.NONE;
|
||||
default -> SearchSetIdentifier.NONE;
|
||||
|
@ -31,7 +31,6 @@ public class SearchService extends Service {
|
||||
SearchFrontPageService frontPageService,
|
||||
SearchErrorPageService errorPageService,
|
||||
SearchAddToCrawlQueueService addToCrawlQueueService,
|
||||
SearchFlagSiteService flagSiteService,
|
||||
SearchSiteInfoService siteInfoService,
|
||||
SearchQueryService searchQueryService
|
||||
) {
|
||||
|
@ -1,28 +0,0 @@
|
||||
package nu.marginalia.search.model;
|
||||
|
||||
import lombok.*;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Getter @AllArgsConstructor @NoArgsConstructor @Builder
|
||||
@ToString
|
||||
public class DomainInformation {
|
||||
EdgeDomain domain;
|
||||
|
||||
boolean blacklisted;
|
||||
int pagesKnown;
|
||||
int pagesFetched;
|
||||
int pagesIndexed;
|
||||
int incomingLinks;
|
||||
int outboundLinks;
|
||||
int nodeAffinity;
|
||||
double ranking;
|
||||
|
||||
boolean suggestForCrawling;
|
||||
boolean inCrawlQueue;
|
||||
boolean unknownDomain;
|
||||
|
||||
String state;
|
||||
List<EdgeDomain> linkingDomains;
|
||||
}
|
@ -5,7 +5,6 @@ import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.search.command.SearchAdtechParameter;
|
||||
import nu.marginalia.search.command.SearchJsParameter;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import org.apache.regexp.RE;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ -37,7 +36,7 @@ public class SearchFilters {
|
||||
filterGroups = List.of(
|
||||
List.of(
|
||||
new Filter("No Filter", SearchProfile.NO_FILTER, parameters),
|
||||
new Filter("Popular", SearchProfile.DEFAULT, parameters),
|
||||
new Filter("Popular", SearchProfile.POPULAR, parameters),
|
||||
new Filter("Small Web", SearchProfile.SMALLWEB, parameters),
|
||||
new Filter("Blogosphere", SearchProfile.BLOGOSPHERE, parameters),
|
||||
new Filter("Academia", SearchProfile.ACADEMIA, parameters)
|
||||
|
@ -8,19 +8,16 @@ import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||
import java.util.Objects;
|
||||
|
||||
public enum SearchProfile {
|
||||
DEFAULT("default", SearchSetIdentifier.RETRO),
|
||||
POPULAR("default", SearchSetIdentifier.POPULAR),
|
||||
SMALLWEB("modern", SearchSetIdentifier.SMALLWEB),
|
||||
BLOGOSPHERE("blogosphere", SearchSetIdentifier.BLOGS),
|
||||
NO_FILTER("corpo", SearchSetIdentifier.NONE),
|
||||
YOLO("yolo", SearchSetIdentifier.NONE),
|
||||
VINTAGE("vintage", SearchSetIdentifier.NONE),
|
||||
TILDE("tilde", SearchSetIdentifier.NONE),
|
||||
CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE),
|
||||
ACADEMIA("academia", SearchSetIdentifier.ACADEMIA),
|
||||
ACADEMIA("academia", SearchSetIdentifier.NONE),
|
||||
PLAIN_TEXT("plain-text", SearchSetIdentifier.NONE),
|
||||
FOOD("food", SearchSetIdentifier.NONE),
|
||||
CRAFTS("crafts", SearchSetIdentifier.NONE),
|
||||
CLASSICS("classics", SearchSetIdentifier.NONE),
|
||||
FOOD("food", SearchSetIdentifier.POPULAR),
|
||||
FORUM("forum", SearchSetIdentifier.NONE),
|
||||
WIKI("wiki", SearchSetIdentifier.NONE),
|
||||
DOCS("docs", SearchSetIdentifier.NONE),
|
||||
@ -38,7 +35,7 @@ public enum SearchProfile {
|
||||
private final static SearchProfile[] values = values();
|
||||
public static SearchProfile getSearchProfile(String param) {
|
||||
if (null == param) {
|
||||
return NO_FILTER;
|
||||
return POPULAR;
|
||||
}
|
||||
|
||||
for (var profile : values) {
|
||||
@ -47,12 +44,12 @@ public enum SearchProfile {
|
||||
}
|
||||
}
|
||||
|
||||
return NO_FILTER;
|
||||
return POPULAR;
|
||||
}
|
||||
|
||||
public void addTacitTerms(SearchSubquery subquery) {
|
||||
if (this == ACADEMIA) {
|
||||
subquery.searchTermsPriority.add("tld:edu");
|
||||
subquery.searchTermsAdvice.add("special:academia");
|
||||
}
|
||||
if (this == VINTAGE) {
|
||||
subquery.searchTermsPriority.add("format:html123");
|
||||
@ -75,9 +72,7 @@ public enum SearchProfile {
|
||||
}
|
||||
if (this == FOOD) {
|
||||
subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
|
||||
}
|
||||
if (this == CRAFTS) {
|
||||
subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_CRAFTS.getKeyword());
|
||||
subquery.searchTermsExclude.add("special:ads");
|
||||
}
|
||||
}
|
||||
|
||||
@ -106,13 +101,5 @@ public enum SearchProfile {
|
||||
else return SpecificationLimit.none();
|
||||
}
|
||||
|
||||
|
||||
|
||||
public String getNearDomain() {
|
||||
if (this == CLASSICS) {
|
||||
return "classics.mit.edu";
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,276 +0,0 @@
|
||||
package nu.marginalia.search.siteinfo;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.search.model.DomainInformation;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
|
||||
/*
|
||||
TODO: This class needs to be refactored, a lot of
|
||||
these SQL queries are redundant and can be
|
||||
collapsed into one single query that fetches
|
||||
all the information
|
||||
*/
|
||||
@Singleton
|
||||
public class DomainInformationService {
|
||||
|
||||
private DbDomainQueries dbDomainQueries;
|
||||
private HikariDataSource dataSource;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public DomainInformationService(
|
||||
DbDomainQueries dbDomainQueries,
|
||||
HikariDataSource dataSource) {
|
||||
this.dbDomainQueries = dbDomainQueries;
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
|
||||
public Optional<DomainInformation> domainInfo(String site) {
|
||||
|
||||
OptionalInt maybeDomainId = getDomainFromPartial(site);
|
||||
if (maybeDomainId.isEmpty()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
int domainId = maybeDomainId.getAsInt();
|
||||
|
||||
Optional<EdgeDomain> domain = dbDomainQueries.getDomain(domainId);
|
||||
if (domain.isEmpty()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
boolean blacklisted = isBlacklisted(domain.get());
|
||||
int pagesKnown = getPagesKnown(domainId);
|
||||
int pagesVisited = getPagesVisited(domainId);
|
||||
int pagesIndexed = getPagesIndexed(domainId);
|
||||
int incomingLinks = getIncomingLinks(domainId);
|
||||
int outboundLinks = getOutboundLinks(domainId);
|
||||
int nodeAffinity = getNodeAffinity(domainId);
|
||||
boolean inCrawlQueue = inCrawlQueue(domainId);
|
||||
|
||||
double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100;
|
||||
|
||||
DomainIndexingState state = getDomainState(domainId);
|
||||
List<EdgeDomain> linkingDomains = getLinkingDomains(domainId);
|
||||
|
||||
var di = DomainInformation.builder()
|
||||
.domain(domain.get())
|
||||
.blacklisted(blacklisted)
|
||||
.pagesKnown(pagesKnown)
|
||||
.pagesFetched(pagesVisited)
|
||||
.pagesIndexed(pagesIndexed)
|
||||
.incomingLinks(incomingLinks)
|
||||
.outboundLinks(outboundLinks)
|
||||
.ranking(rank)
|
||||
.state(state.desc)
|
||||
.linkingDomains(linkingDomains)
|
||||
.inCrawlQueue(inCrawlQueue)
|
||||
.nodeAffinity(nodeAffinity)
|
||||
.suggestForCrawling((pagesVisited == 0 && outboundLinks == 0 && !inCrawlQueue))
|
||||
.build();
|
||||
|
||||
return Optional.of(di);
|
||||
}
|
||||
|
||||
private int getNodeAffinity(int domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
SELECT NODE_AFFINITY FROM EC_DOMAIN WHERE ID=?
|
||||
""")) {
|
||||
stmt.setInt(1, domainId);
|
||||
var rs = stmt.executeQuery();
|
||||
if (rs.next())
|
||||
return rs.getInt(1);
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("SQL error", ex);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private boolean inCrawlQueue(int domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
SELECT 1 FROM CRAWL_QUEUE
|
||||
INNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME
|
||||
WHERE EC_DOMAIN.ID=?
|
||||
"""))
|
||||
{
|
||||
stmt.setInt(1, domainId);
|
||||
var rsp = stmt.executeQuery();
|
||||
return rsp.next();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private OptionalInt getDomainFromPartial(String site) {
|
||||
return dbDomainQueries.tryGetDomainId(new EdgeDomain(site));
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public boolean isBlacklisted(EdgeDomain domain) {
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN IN (?,?)")) {
|
||||
stmt.setString(1, domain.domain);
|
||||
stmt.setString(2, domain.toString());
|
||||
var rsp = stmt.executeQuery();
|
||||
return rsp.next();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public int getPagesKnown(int domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public int getPagesVisited(int domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public int getPagesIndexed(int domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public int getIncomingLinks(int domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) {
|
||||
stmt.setInt(1, domainId);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@SneakyThrows
|
||||
public int getOutboundLinks(int domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) {
|
||||
stmt.setInt(1, domainId);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
public DomainIndexingState getDomainState(int domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return DomainIndexingState.valueOf(rsp.getString(1));
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
return DomainIndexingState.ERROR;
|
||||
}
|
||||
|
||||
public List<EdgeDomain> getLinkingDomains(int domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
List<EdgeDomain> results = new ArrayList<>(25);
|
||||
try (var stmt = connection.prepareStatement("SELECT SOURCE_DOMAIN FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) {
|
||||
stmt.setInt(1, domainId);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
results.add(new EdgeDomain(rsp.getString(1)));
|
||||
}
|
||||
return results;
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
public double getRank(int domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getDouble(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
}
|
@ -1,16 +1,16 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.assistant.client.AssistantClient;
|
||||
import nu.marginalia.assistant.client.model.SimilarDomain;
|
||||
import nu.marginalia.client.Context;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.search.SearchOperator;
|
||||
import nu.marginalia.search.model.DomainInformation;
|
||||
import nu.marginalia.assistant.client.model.DomainInformation;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import nu.marginalia.search.siteinfo.DomainInformationService;
|
||||
import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
@ -23,22 +23,19 @@ import java.util.Map;
|
||||
public class SearchSiteInfoService {
|
||||
|
||||
private final SearchOperator searchOperator;
|
||||
private final SimilarDomainsService similarDomains;
|
||||
private final DomainInformationService domainInformationService;
|
||||
private final AssistantClient assistantClient;
|
||||
private final SearchFlagSiteService flagSiteService;
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final MustacheRenderer<Object> renderer;
|
||||
|
||||
@Inject
|
||||
public SearchSiteInfoService(SearchOperator searchOperator,
|
||||
SimilarDomainsService similarDomains,
|
||||
DomainInformationService domainInformationService,
|
||||
AssistantClient assistantClient,
|
||||
RendererFactory rendererFactory,
|
||||
SearchFlagSiteService flagSiteService,
|
||||
DbDomainQueries domainQueries) throws IOException {
|
||||
this.searchOperator = searchOperator;
|
||||
this.similarDomains = similarDomains;
|
||||
this.domainInformationService = domainInformationService;
|
||||
this.assistantClient = assistantClient;
|
||||
this.flagSiteService = flagSiteService;
|
||||
this.domainQueries = domainQueries;
|
||||
|
||||
@ -108,13 +105,6 @@ public class SearchSiteInfoService {
|
||||
false);
|
||||
}
|
||||
|
||||
private DomainInformation dummyInformation(String domainName) {
|
||||
return DomainInformation.builder()
|
||||
.domain(new EdgeDomain(domainName))
|
||||
.suggestForCrawling(true)
|
||||
.unknownDomain(true)
|
||||
.build();
|
||||
}
|
||||
|
||||
private Backlinks listLinks(Context ctx, String domainName) {
|
||||
return new Backlinks(domainName,
|
||||
@ -126,13 +116,24 @@ public class SearchSiteInfoService {
|
||||
|
||||
final int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);
|
||||
|
||||
final DomainInformation domainInfo = domainInformationService.domainInfo(domainName)
|
||||
.orElseGet(() -> dummyInformation(domainName));
|
||||
final DomainInformation domainInfo;
|
||||
final List<SimilarDomain> similarSet;
|
||||
final List<SimilarDomain> linkingDomains;
|
||||
|
||||
final List<SimilarDomainsService.SimilarDomain> similarSet =
|
||||
similarDomains.getSimilarDomains(domainId, 100);
|
||||
final List<SimilarDomainsService.SimilarDomain> linkingDomains =
|
||||
similarDomains.getLinkingDomains(domainId, 100);
|
||||
if (domainId < 0 || !assistantClient.isAccepting()) {
|
||||
domainInfo = createDummySiteInfo(domainName);
|
||||
similarSet = List.of();
|
||||
linkingDomains = List.of();
|
||||
}
|
||||
else {
|
||||
domainInfo = assistantClient.domainInformation(ctx, domainId).blockingFirst();
|
||||
similarSet = assistantClient
|
||||
.similarDomains(ctx, domainId, 100)
|
||||
.blockingFirst();
|
||||
linkingDomains = assistantClient
|
||||
.linkedDomains(ctx, domainId, 100)
|
||||
.blockingFirst();
|
||||
}
|
||||
|
||||
return new SiteInfoWithContext(domainName,
|
||||
domainId,
|
||||
@ -141,6 +142,15 @@ public class SearchSiteInfoService {
|
||||
linkingDomains
|
||||
);
|
||||
}
|
||||
|
||||
private DomainInformation createDummySiteInfo(String domainName) {
|
||||
return DomainInformation.builder()
|
||||
.domain(new EdgeDomain(domainName))
|
||||
.suggestForCrawling(true)
|
||||
.unknownDomain(true)
|
||||
.build();
|
||||
}
|
||||
|
||||
private Docs listDocs(Context ctx, String domainName) {
|
||||
return new Docs(domainName,
|
||||
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
|
||||
@ -181,13 +191,13 @@ public class SearchSiteInfoService {
|
||||
String domain,
|
||||
long domainId,
|
||||
DomainInformation domainInformation,
|
||||
List<SimilarDomainsService.SimilarDomain> similar,
|
||||
List<SimilarDomainsService.SimilarDomain> linking) {
|
||||
List<SimilarDomain> similar,
|
||||
List<SimilarDomain> linking) {
|
||||
public SiteInfoWithContext(String domain,
|
||||
long domainId,
|
||||
DomainInformation domainInformation,
|
||||
List<SimilarDomainsService.SimilarDomain> similar,
|
||||
List<SimilarDomainsService.SimilarDomain> linking
|
||||
List<SimilarDomain> similar,
|
||||
List<SimilarDomain> linking
|
||||
)
|
||||
{
|
||||
this(Map.of("info", true),
|
||||
|
@ -99,7 +99,6 @@
|
||||
</section>
|
||||
<section id="legal">
|
||||
<h1>Policies</h1>
|
||||
|
||||
This website complies with the GDPR by <em>not collecting any personal
|
||||
information</em>, and with the EU Cookie Directive by <em>not using
|
||||
cookies</em>. <a href="https://memex.marginalia.nu/projects/edge/privacy.gmi">More Information</a>.
|
||||
@ -109,8 +108,13 @@
|
||||
<h1> Open Source </h1>
|
||||
The search engine is open source with an AGPL license. The sources can be perused at
|
||||
<tt><a href="https://git.marginalia.nu/">https://git.marginalia.nu/</a></tt>.
|
||||
|
||||
<h1>Data Sources</h1>
|
||||
IP geolocation is sourced from the IP2Location LITE data available from
|
||||
<a rel="external noopener nofollow" href="https://lite.ip2location.com/">https://lite.ip2location.com/</a>
|
||||
under
|
||||
<a rel="external noopener nofollow" href="https://creativecommons.org/licenses/by-sa/4.0/">CC-BY-SA 4.0</a>.
|
||||
</section>
|
||||
|
||||
</footer>
|
||||
|
||||
<script src="/tts.js"></script>
|
||||
|
@ -6,5 +6,6 @@
|
||||
Pages Known: {{pagesKnown}} <br/>
|
||||
Pages Crawled: {{pagesFetched}} <br/>
|
||||
Pages Indexed: {{pagesIndexed}} <br/>
|
||||
IP: {{ip}} {{#if ipCountry}}<span title="{{ipCountry}}">{{getIpFlag}}</span>{{/if}}<br/>
|
||||
</fieldset>
|
||||
<br/>
|
@ -27,11 +27,13 @@ dependencies {
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:common:service-discovery')
|
||||
implementation project(':code:common:service-client')
|
||||
|
||||
implementation project(':code:features-search:screenshots')
|
||||
|
||||
implementation project(':code:libraries:geo-ip')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
|
||||
|
@ -3,6 +3,8 @@ package nu.marginalia.assistant;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.assistant.domains.DomainInformationService;
|
||||
import nu.marginalia.assistant.domains.SimilarDomainsService;
|
||||
import nu.marginalia.assistant.eval.Units;
|
||||
import nu.marginalia.assistant.suggest.Suggestions;
|
||||
import nu.marginalia.assistant.eval.MathParser;
|
||||
@ -16,11 +18,16 @@ import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Objects;
|
||||
|
||||
public class AssistantService extends Service {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Gson gson = GsonFactory.get();
|
||||
private final Units units;
|
||||
private final MathParser mathParser;
|
||||
private final SimilarDomainsService similarDomainsService;
|
||||
private final DomainInformationService domainInformationService;
|
||||
private final Suggestions suggestions;
|
||||
|
||||
@SneakyThrows
|
||||
@ -30,12 +37,16 @@ public class AssistantService extends Service {
|
||||
MathParser mathParser,
|
||||
Units units,
|
||||
ScreenshotService screenshotService,
|
||||
SimilarDomainsService similarDomainsService,
|
||||
DomainInformationService domainInformationService,
|
||||
Suggestions suggestions)
|
||||
{
|
||||
super(params);
|
||||
|
||||
this.mathParser = mathParser;
|
||||
this.units = units;
|
||||
this.similarDomainsService = similarDomainsService;
|
||||
this.domainInformationService = domainInformationService;
|
||||
this.suggestions = suggestions;
|
||||
|
||||
Spark.staticFiles.expireTime(600);
|
||||
@ -56,12 +67,50 @@ public class AssistantService extends Service {
|
||||
rsp,
|
||||
req.queryParams("value")
|
||||
));
|
||||
|
||||
Spark.get("/domain/:id/similar", this::getSimilarDomains, this::convertToJson);
|
||||
Spark.get("/domain/:id/linking", this::getLinkingDomains, this::convertToJson);
|
||||
Spark.get("/domain/:id/info", this::getDomainInformation, this::convertToJson);
|
||||
Spark.get("/public/suggest/", this::getSuggestions, this::convertToJson);
|
||||
|
||||
Spark.awaitInitialization();
|
||||
}
|
||||
|
||||
private Object getSimilarDomains(Request request, Response response) {
|
||||
int domainId = Integer.parseInt(request.params("id"));
|
||||
int count = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("count"), "25"));
|
||||
|
||||
response.type("application/json");
|
||||
|
||||
if (!similarDomainsService.isReady()) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
return similarDomainsService.getSimilarDomains(domainId, count);
|
||||
}
|
||||
|
||||
private Object getLinkingDomains(Request request, Response response) {
|
||||
int domainId = Integer.parseInt(request.params("id"));
|
||||
int count = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("count"), "25"));
|
||||
|
||||
response.type("application/json");
|
||||
if (!similarDomainsService.isReady()) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
return similarDomainsService.getLinkingDomains(domainId, count);
|
||||
}
|
||||
|
||||
private Object getDomainInformation(Request request, Response response) {
|
||||
int domainId = Integer.parseInt(request.params("id"));
|
||||
|
||||
response.type("application/json");
|
||||
|
||||
var maybeDomainInfo = domainInformationService.domainInfo(domainId);
|
||||
if (maybeDomainInfo.isEmpty()) {
|
||||
Spark.halt(404);
|
||||
}
|
||||
return maybeDomainInfo.get();
|
||||
}
|
||||
|
||||
private Object getSuggestions(Request request, Response response) {
|
||||
response.type("application/json");
|
||||
var param = request.queryParams("partial");
|
||||
|
@ -0,0 +1,115 @@
|
||||
package nu.marginalia.assistant.domains;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.assistant.client.model.DomainInformation;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
|
||||
@Singleton
|
||||
public class DomainInformationService {
|
||||
private final GeoIpDictionary geoIpDictionary;
|
||||
|
||||
private DbDomainQueries dbDomainQueries;
|
||||
private HikariDataSource dataSource;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public DomainInformationService(
|
||||
DbDomainQueries dbDomainQueries,
|
||||
GeoIpDictionary geoIpDictionary,
|
||||
HikariDataSource dataSource) {
|
||||
this.dbDomainQueries = dbDomainQueries;
|
||||
this.geoIpDictionary = geoIpDictionary;
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
|
||||
public Optional<DomainInformation> domainInfo(int domainId) {
|
||||
|
||||
Optional<EdgeDomain> domain = dbDomainQueries.getDomain(domainId);
|
||||
if (domain.isEmpty()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
var builder = DomainInformation.builder();
|
||||
try (var connection = dataSource.getConnection();
|
||||
var stmt = connection.createStatement();
|
||||
) {
|
||||
boolean inCrawlQueue;
|
||||
int outboundLinks = 0;
|
||||
int pagesVisited = 0;
|
||||
|
||||
ResultSet rs;
|
||||
|
||||
rs = stmt.executeQuery(STR."""
|
||||
SELECT IP, NODE_AFFINITY, DOMAIN_NAME, STATE, IFNULL(RANK, 1) AS RANK
|
||||
FROM EC_DOMAIN WHERE ID=\{domainId}
|
||||
""");
|
||||
if (rs.next()) {
|
||||
String ip = rs.getString("IP");
|
||||
|
||||
builder.ip(ip);
|
||||
builder.ipCountry(geoIpDictionary.getCountry(ip));
|
||||
|
||||
builder.nodeAffinity(rs.getInt("NODE_AFFINITY"));
|
||||
builder.domain(new EdgeDomain(rs.getString("DOMAIN_NAME")));
|
||||
builder.state(rs.getString("STATE"));
|
||||
builder.ranking(Math.round(100.0*(1.0-rs.getDouble("RANK"))));
|
||||
}
|
||||
rs = stmt.executeQuery(STR."""
|
||||
SELECT 1 FROM CRAWL_QUEUE
|
||||
INNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME
|
||||
WHERE EC_DOMAIN.ID=\{domainId}
|
||||
""");
|
||||
inCrawlQueue = rs.next();
|
||||
builder.inCrawlQueue(inCrawlQueue);
|
||||
|
||||
rs = stmt.executeQuery(STR."""
|
||||
SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=\{domainId}
|
||||
""");
|
||||
if (rs.next()) {
|
||||
builder.incomingLinks(rs.getInt(1));
|
||||
}
|
||||
|
||||
rs = stmt.executeQuery(STR."""
|
||||
SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=\{domainId}
|
||||
""");
|
||||
if (rs.next()) {
|
||||
builder.outboundLinks(rs.getInt(1));
|
||||
outboundLinks = rs.getInt(1);
|
||||
}
|
||||
|
||||
|
||||
rs = stmt.executeQuery(STR."""
|
||||
SELECT KNOWN_URLS, GOOD_URLS, VISITED_URLS FROM DOMAIN_METADATA WHERE ID=\{domainId}
|
||||
""");
|
||||
if (rs.next()) {
|
||||
pagesVisited = rs.getInt("VISITED_URLS");
|
||||
|
||||
builder.pagesKnown(rs.getInt("KNOWN_URLS"));
|
||||
builder.pagesIndexed(rs.getInt("GOOD_URLS"));
|
||||
builder.pagesFetched(rs.getInt("VISITED_URLS"));
|
||||
}
|
||||
|
||||
builder.suggestForCrawling((pagesVisited == 0 && outboundLinks == 0 && !inCrawlQueue));
|
||||
|
||||
return Optional.of(builder.build());
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("SQL error", ex);
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.search.svc;
|
||||
package nu.marginalia.assistant.domains;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
@ -6,11 +6,10 @@ import gnu.trove.list.TIntList;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import gnu.trove.map.hash.TIntDoubleHashMap;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import gnu.trove.map.hash.TLongDoubleHashMap;
|
||||
import gnu.trove.set.TIntSet;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import nu.marginalia.assistant.client.model.SimilarDomain;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -40,6 +39,8 @@ public class SimilarDomainsService {
|
||||
public volatile double[] domainRanks = null;
|
||||
public volatile String[] domainNames = null;
|
||||
|
||||
volatile boolean isReady = false;
|
||||
|
||||
@Inject
|
||||
public SimilarDomainsService(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
@ -167,6 +168,7 @@ public class SimilarDomainsService {
|
||||
|
||||
logger.info("Loaded {} domains", domainRanks.length);
|
||||
logger.info("All done!");
|
||||
isReady = true;
|
||||
}
|
||||
}
|
||||
catch (SQLException throwables) {
|
||||
@ -174,7 +176,11 @@ public class SimilarDomainsService {
|
||||
}
|
||||
}
|
||||
|
||||
double getRelatedness(int a, int b) {
|
||||
public boolean isReady() {
|
||||
return isReady;
|
||||
}
|
||||
|
||||
private double getRelatedness(int a, int b) {
|
||||
int lowerIndex = Math.min(domainIdToIdx.get(a), domainIdToIdx.get(b));
|
||||
int higherIndex = Math.max(domainIdToIdx.get(a), domainIdToIdx.get(b));
|
||||
|
||||
@ -233,14 +239,14 @@ public class SimilarDomainsService {
|
||||
indexedDomains.get(idx),
|
||||
activeDomains.get(idx),
|
||||
screenshotDomains.get(idx),
|
||||
LinkType.find(
|
||||
SimilarDomain.LinkType.find(
|
||||
linkingIdsStoD.contains(idx),
|
||||
linkingIdsDtoS.contains(idx)
|
||||
)
|
||||
));
|
||||
}
|
||||
|
||||
domains.removeIf(d -> d.url.domain.toString().length() > 32);
|
||||
domains.removeIf(d -> d.url().domain.toString().length() > 32);
|
||||
|
||||
return domains;
|
||||
}
|
||||
@ -319,84 +325,16 @@ public class SimilarDomainsService {
|
||||
indexedDomains.get(idx),
|
||||
activeDomains.get(idx),
|
||||
screenshotDomains.get(idx),
|
||||
LinkType.find(
|
||||
SimilarDomain.LinkType.find(
|
||||
linkingIdsStoD.contains(idx),
|
||||
linkingIdsDtoS.contains(idx)
|
||||
)
|
||||
));
|
||||
}
|
||||
|
||||
domains.removeIf(d -> d.url.domain.toString().length() > 32);
|
||||
domains.removeIf(d -> d.url().domain.toString().length() > 32);
|
||||
|
||||
return domains;
|
||||
}
|
||||
|
||||
public record SimilarDomain(EdgeUrl url,
|
||||
int domainId,
|
||||
double relatedness,
|
||||
double rank,
|
||||
boolean indexed,
|
||||
boolean active,
|
||||
boolean screenshot,
|
||||
LinkType linkType)
|
||||
{
|
||||
|
||||
public String getRankSymbols() {
|
||||
if (rank > 90) {
|
||||
return "★★★★★";
|
||||
}
|
||||
if (rank > 70) {
|
||||
return "★★★★";
|
||||
}
|
||||
if (rank > 50) {
|
||||
return "★★★";
|
||||
}
|
||||
if (rank > 30) {
|
||||
return "★★";
|
||||
}
|
||||
if (rank > 10) {
|
||||
return "★";
|
||||
}
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
enum LinkType {
|
||||
BACKWARD,
|
||||
FOWARD,
|
||||
BIDIRECTIONAL,
|
||||
NONE;
|
||||
|
||||
public static LinkType find(boolean linkStod,
|
||||
boolean linkDtos)
|
||||
{
|
||||
if (linkDtos && linkStod)
|
||||
return BIDIRECTIONAL;
|
||||
if (linkDtos)
|
||||
return FOWARD;
|
||||
if (linkStod)
|
||||
return BACKWARD;
|
||||
|
||||
return NONE;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return switch (this) {
|
||||
case FOWARD -> "→";
|
||||
case BACKWARD -> "←";
|
||||
case BIDIRECTIONAL -> "⇆";
|
||||
case NONE -> "-";
|
||||
};
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return switch (this) {
|
||||
case BACKWARD -> "Backward Link";
|
||||
case FOWARD -> "Forward Link";
|
||||
case BIDIRECTIONAL -> "Mutual Link";
|
||||
case NONE -> "No Link";
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
}
|
@ -20,8 +20,9 @@ import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
public class Suggestions {
|
||||
private final PatriciaTrie<String> suggestionsTrie;
|
||||
private final TermFrequencyDict termFrequencyDict;
|
||||
private PatriciaTrie<String> suggestionsTrie = null;
|
||||
private TermFrequencyDict termFrequencyDict = null;
|
||||
private volatile boolean ready = false;
|
||||
private final SpellChecker spellChecker;
|
||||
|
||||
private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
|
||||
@ -35,10 +36,12 @@ public class Suggestions {
|
||||
) {
|
||||
this.spellChecker = spellChecker;
|
||||
|
||||
suggestionsTrie = loadSuggestions(suggestionsFile);
|
||||
termFrequencyDict = dict;
|
||||
|
||||
logger.info("Loaded {} suggestions", suggestionsTrie.size());
|
||||
Thread.ofPlatform().start(() -> {
|
||||
suggestionsTrie = loadSuggestions(suggestionsFile);
|
||||
termFrequencyDict = dict;
|
||||
ready = true;
|
||||
logger.info("Loaded {} suggestions", suggestionsTrie.size());
|
||||
});
|
||||
}
|
||||
|
||||
private static PatriciaTrie<String> loadSuggestions(Path file) {
|
||||
@ -71,6 +74,9 @@ public class Suggestions {
|
||||
}
|
||||
|
||||
public List<String> getSuggestions(int count, String searchWord) {
|
||||
if (!ready)
|
||||
return Collections.emptyList();
|
||||
|
||||
if (searchWord.length() < MIN_SUGGEST_LENGTH) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
@ -126,6 +132,9 @@ public class Suggestions {
|
||||
|
||||
|
||||
public Stream<String> getSuggestionsForKeyword(int count, String prefix) {
|
||||
if (!ready)
|
||||
return Stream.empty();
|
||||
|
||||
if (prefix.length() < MIN_SUGGEST_LENGTH) {
|
||||
return Stream.empty();
|
||||
}
|
||||
|
@ -25,7 +25,7 @@ public class RecrawlAllActor extends RecordActorPrototype {
|
||||
|
||||
public record Initial() implements ActorStep {}
|
||||
|
||||
public record WaitFinished(int node) implements ActorStep {}
|
||||
public record WaitFinished(int node, long msgId) implements ActorStep {}
|
||||
@Resume(behavior=ActorResumeBehavior.RETRY)
|
||||
public record Trigger(int node) implements ActorStep {}
|
||||
public record AdvanceNode(int node) implements ActorStep {}
|
||||
@ -49,17 +49,18 @@ public class RecrawlAllActor extends RecordActorPrototype {
|
||||
|
||||
var data = new ExecutorRemoteActorFactory.CrawlData(activeFileStorage.get(0), true);
|
||||
|
||||
if (remoteActorFactory.createCrawlRemote(node).trigger(data)) {
|
||||
yield new WaitFinished(node);
|
||||
long msgId = remoteActorFactory.createCrawlRemote(node).trigger(data);
|
||||
if (msgId >= 0) {
|
||||
yield new WaitFinished(node, msgId);
|
||||
}
|
||||
else {
|
||||
yield new AdvanceNode(node);
|
||||
}
|
||||
}
|
||||
case WaitFinished(int node) -> {
|
||||
case WaitFinished(int node, long msgId) -> {
|
||||
var remoteActor = remoteActorFactory.createCrawlRemote(node);
|
||||
for (;;) {
|
||||
var state = remoteActor.getState();
|
||||
var state = remoteActor.getState(msgId);
|
||||
if ("END".equals(state) || "ERROR".equals(state)) {
|
||||
break;
|
||||
}
|
||||
@ -80,8 +81,7 @@ public class RecrawlAllActor extends RecordActorPrototype {
|
||||
public RecrawlAllActor(Gson gson,
|
||||
ExecutorRemoteActorFactory remoteActorFactory,
|
||||
FileStorageService fileStorageService,
|
||||
PrecessionNodes precessionNodes,
|
||||
NodeConfigurationService nodeConfigurationService)
|
||||
PrecessionNodes precessionNodes)
|
||||
{
|
||||
super(gson);
|
||||
this.remoteActorFactory = remoteActorFactory;
|
||||
|
@ -24,7 +24,7 @@ public class ReprocessAllActor extends RecordActorPrototype {
|
||||
|
||||
public record Initial() implements ActorStep {}
|
||||
|
||||
public record WaitFinished(int node) implements ActorStep {}
|
||||
public record WaitFinished(int node, long msgId) implements ActorStep {}
|
||||
@Resume(behavior=ActorResumeBehavior.RETRY)
|
||||
public record Trigger(int node) implements ActorStep {}
|
||||
public record AdvanceNode(int node) implements ActorStep {}
|
||||
@ -47,17 +47,18 @@ public class ReprocessAllActor extends RecordActorPrototype {
|
||||
|
||||
var data = new ExecutorRemoteActorFactory.ConvertAndLoadData(activeFileStorage.get(0));
|
||||
|
||||
if (remoteActorFactory.createConvertAndLoadRemote(node).trigger(data)) {
|
||||
yield new WaitFinished(node);
|
||||
long msgId = remoteActorFactory.createConvertAndLoadRemote(node).trigger(data);
|
||||
if (msgId >= 0) {
|
||||
yield new WaitFinished(node, msgId);
|
||||
}
|
||||
else {
|
||||
yield new AdvanceNode(node);
|
||||
}
|
||||
}
|
||||
case WaitFinished(int node) -> {
|
||||
case WaitFinished(int node, long msgId) -> {
|
||||
var remoteActor = remoteActorFactory.createConvertAndLoadRemote(node);
|
||||
for (;;) {
|
||||
var state = remoteActor.getState();
|
||||
var state = remoteActor.getState(msgId);
|
||||
if ("END".equals(state) || "ERROR".equals(state))
|
||||
break;
|
||||
TimeUnit.SECONDS.sleep(10);
|
||||
|
@ -37,7 +37,7 @@ public class IndexSearchSetsService {
|
||||
|
||||
|
||||
// Below are binary indices that are used to constrain a search
|
||||
private volatile RankingSearchSet retroSet;
|
||||
private volatile RankingSearchSet popularSet;
|
||||
private volatile RankingSearchSet smallWebSet;
|
||||
private volatile RankingSearchSet academiaSet;
|
||||
private volatile RankingSearchSet blogsSet;
|
||||
@ -72,7 +72,7 @@ public class IndexSearchSetsService {
|
||||
|
||||
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
|
||||
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat"));
|
||||
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat"));
|
||||
popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, servicesFactory.getSearchSetsBase().resolve("popular.dat"));
|
||||
blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, servicesFactory.getSearchSetsBase().resolve("blogs.dat"));
|
||||
}
|
||||
|
||||
@ -86,7 +86,7 @@ public class IndexSearchSetsService {
|
||||
}
|
||||
return switch (searchSetIdentifier) {
|
||||
case NONE -> anySet;
|
||||
case RETRO -> retroSet;
|
||||
case POPULAR -> popularSet;
|
||||
case ACADEMIA -> academiaSet;
|
||||
case SMALLWEB -> smallWebSet;
|
||||
case BLOGS -> blogsSet;
|
||||
@ -95,7 +95,7 @@ public class IndexSearchSetsService {
|
||||
|
||||
enum RepartitionSteps {
|
||||
UPDATE_ACADEMIA,
|
||||
UPDATE_RETRO,
|
||||
UPDATE_POPULAR,
|
||||
UPDATE_SMALL_WEB,
|
||||
UPDATE_BLOGS,
|
||||
UPDATE_RANKINGS,
|
||||
@ -107,8 +107,8 @@ public class IndexSearchSetsService {
|
||||
processHeartbeat.progress(RepartitionSteps.UPDATE_ACADEMIA);
|
||||
updateAcademiaDomainsSet();
|
||||
|
||||
processHeartbeat.progress(RepartitionSteps.UPDATE_RETRO);
|
||||
updateRetroDomainsSet();
|
||||
processHeartbeat.progress(RepartitionSteps.UPDATE_POPULAR);
|
||||
updatePopularDomainsSet();
|
||||
|
||||
processHeartbeat.progress(RepartitionSteps.UPDATE_SMALL_WEB);
|
||||
updateSmallWebDomainsSet();
|
||||
@ -139,15 +139,15 @@ public class IndexSearchSetsService {
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateRetroDomainsSet() {
|
||||
public void updatePopularDomainsSet() {
|
||||
var entry = rankingSettings.retro;
|
||||
|
||||
var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
|
||||
var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
|
||||
|
||||
synchronized (this) {
|
||||
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
|
||||
retroSet.write();
|
||||
popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, popularSet.source, data);
|
||||
popularSet.write();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -12,6 +12,7 @@ include 'code:services-application:dating-service'
|
||||
include 'code:services-application:explorer-service'
|
||||
|
||||
include 'code:libraries:array'
|
||||
include 'code:libraries:geo-ip'
|
||||
include 'code:libraries:btree'
|
||||
include 'code:libraries:easy-lsh'
|
||||
include 'code:libraries:guarded-regex'
|
||||
|
Loading…
Reference in New Issue
Block a user