diff --git a/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/AssistantClient.java b/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/AssistantClient.java index eaa9f0f5..aa263599 100644 --- a/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/AssistantClient.java +++ b/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/AssistantClient.java @@ -1,12 +1,14 @@ package nu.marginalia.assistant.client; +import com.google.gson.reflect.TypeToken; import com.google.inject.Inject; import com.google.inject.Singleton; import io.reactivex.rxjava3.core.Observable; import nu.marginalia.assistant.client.model.DictionaryResponse; +import nu.marginalia.assistant.client.model.DomainInformation; +import nu.marginalia.assistant.client.model.SimilarDomain; import nu.marginalia.client.AbstractDynamicClient; import nu.marginalia.client.exception.RouteNotConfiguredException; -import nu.marginalia.WmsaHome; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.service.descriptor.ServiceDescriptors; import nu.marginalia.service.id.ServiceId; @@ -14,6 +16,7 @@ import nu.marginalia.client.Context; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; import java.util.List; @Singleton @@ -59,4 +62,31 @@ public class AssistantClient extends AbstractDynamicClient { return Observable.empty(); } } + + public Observable> similarDomains(Context ctx, int domainId, int count) { + try { + return super.get(ctx, 0, STR."/domain/\{domainId}/similar?count=\{count}", new TypeToken>() {}); + } + catch (RouteNotConfiguredException ex) { + return Observable.empty(); + } + } + + public Observable> linkedDomains(Context ctx, int domainId, int count) { + try { + return super.get(ctx, 0, STR."/domain/\{domainId}/linking?count=\{count}", new TypeToken>() {}); + } + catch (RouteNotConfiguredException ex) { + return Observable.empty(); + } + } + + public Observable domainInformation(Context ctx, int domainId) { + try { + return super.get(ctx, 0, STR."/domain/\{domainId}/info", DomainInformation.class); + } + catch (RouteNotConfiguredException ex) { + return Observable.empty(); + } + } } diff --git a/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DomainInformation.java b/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DomainInformation.java new file mode 100644 index 00000000..5cf28278 --- /dev/null +++ b/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DomainInformation.java @@ -0,0 +1,42 @@ +package nu.marginalia.assistant.client.model; + +import lombok.*; +import nu.marginalia.model.EdgeDomain; + +@Getter @AllArgsConstructor @NoArgsConstructor @Builder +@ToString +public class DomainInformation { + EdgeDomain domain; + + boolean blacklisted; + int pagesKnown; + int pagesFetched; + int pagesIndexed; + int incomingLinks; + int outboundLinks; + int nodeAffinity; + double ranking; + + boolean suggestForCrawling; + boolean inCrawlQueue; + boolean unknownDomain; + + String ip; + String ipCountry; + String state; + + public String getIpFlag() { + if (ipCountry == null || ipCountry.isBlank()) { + return ""; + } + String country = ipCountry; + if ("UK".equals(country)) { + return "GB"; + } + int offset = 0x1F1E6; + int asciiOffset = 0x41; + int firstChar = Character.codePointAt(country, 0) - asciiOffset + offset; + int secondChar = Character.codePointAt(country, 1) - asciiOffset + offset; + return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar)); + } +} diff --git a/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/SimilarDomain.java b/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/SimilarDomain.java new file mode 100644 index 00000000..ccd7fe5b --- /dev/null +++ b/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/SimilarDomain.java @@ -0,0 +1,69 @@ +package nu.marginalia.assistant.client.model; + +import nu.marginalia.model.EdgeUrl; + +public record SimilarDomain(EdgeUrl url, + int domainId, + double relatedness, + double rank, + boolean indexed, + boolean active, + boolean screenshot, + LinkType linkType) { + + public String getRankSymbols() { + if (rank > 90) { + return "★★★★★"; + } + if (rank > 70) { + return "★★★★"; + } + if (rank > 50) { + return "★★★"; + } + if (rank > 30) { + return "★★"; + } + if (rank > 10) { + return "★"; + } + return ""; + } + + public enum LinkType { + BACKWARD, + FOWARD, + BIDIRECTIONAL, + NONE; + + public static LinkType find(boolean linkStod, + boolean linkDtos) { + if (linkDtos && linkStod) + return BIDIRECTIONAL; + if (linkDtos) + return FOWARD; + if (linkStod) + return BACKWARD; + + return NONE; + } + + public String toString() { + return switch (this) { + case FOWARD -> "→"; + case BACKWARD -> "←"; + case BIDIRECTIONAL -> "⇆"; + case NONE -> "-"; + }; + } + + public String getDescription() { + return switch (this) { + case BACKWARD -> "Backward Link"; + case FOWARD -> "Forward Link"; + case BIDIRECTIONAL -> "Mutual Link"; + case NONE -> "No Link"; + }; + } + } +} diff --git a/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorRemoteActorFactory.java b/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorRemoteActorFactory.java index 4e5cb8cb..ffbe168c 100644 --- a/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorRemoteActorFactory.java +++ b/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorRemoteActorFactory.java @@ -34,8 +34,23 @@ public class ExecutorRemoteActorFactory { } public interface ExecutorRemoteActorIf { - boolean trigger(T object) throws Exception; - String getState(); + + /** Trigger the remote actor with the given object. The object will be serialized to JSON and sent to the + * remote actor. If the remote actor does not respond after a time period, a timeout will occur and a negative + * message id will be returned. + * + * @param object The message to send to the remote actot + * @return The message id of the response message, or a negative number if the remote actor did not respond + * within a reasonable timeout seconds. + */ + long trigger(T object) throws Exception; + + /** Get the last finished state of the actor. + *

+ * The message id of the request initiating the actor must be provided to ensure that + * we don't get a state from a previous run. + */ + String getState(long fromMsgId); } public record CrawlData(FileStorageId storageId, boolean cascadeLoad) {} @@ -58,11 +73,11 @@ class ExecutorRemoteActor implements ExecutorRemoteActorFactory.ExecutorRemot this.triggerFunction = triggerFunction; } - public boolean trigger(T object) throws Exception { + public long trigger(T object) throws Exception { return trigger(gson.toJson(object)); } - public boolean trigger(String payload) throws Exception { + public long trigger(String payload) throws Exception { long id = persistence.sendNewMessage(inboxName, null, null, triggerFunction, payload, null); // Wait for the remote actor to respond to the message @@ -70,19 +85,19 @@ class ExecutorRemoteActor implements ExecutorRemoteActorFactory.ExecutorRemot for (int i = 0; i < 120; i++) { var msg = persistence.getMessage(id); if (msg.state() == MqMessageState.ACK || msg.state() == MqMessageState.OK) - return true; + return id; if (msg.state() == MqMessageState.ERR || msg.state() == MqMessageState.DEAD) - return false; + return -id; TimeUnit.SECONDS.sleep(1); } - return false; // Timeout + return -1; // Timeout } - public String getState() { + public String getState(long fromMsgId) { return persistence - .getHeadMessage(inboxName) + .getHeadMessage(inboxName, fromMsgId) .map(MqMessage::function) .orElse("INITIAL"); } diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java index e89d6d8b..4eb0eae0 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java @@ -7,7 +7,7 @@ package nu.marginalia.index.client.model.query; * */ public enum SearchSetIdentifier { NONE, - RETRO, + POPULAR, BLOGS, ACADEMIA, SMALLWEB diff --git a/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java b/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java index f190bfe4..d603a546 100644 --- a/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java +++ b/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java @@ -1,6 +1,5 @@ package nu.marginalia.client; -import com.google.common.util.concurrent.ThreadFactoryBuilder; import io.reactivex.rxjava3.core.Scheduler; import io.reactivex.rxjava3.schedulers.Schedulers; import org.slf4j.Logger; @@ -10,26 +9,16 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; -import java.util.concurrent.ThreadFactory; public class AbortingScheduler { - private final ThreadFactory threadFactory; - private final Logger logger = LoggerFactory.getLogger(getClass()); @Nullable private ExecutorService executorService; - public AbortingScheduler(String name) { - threadFactory = new ThreadFactoryBuilder() - .setNameFormat(name+"client--%d") - .setUncaughtExceptionHandler(this::handleException) - .build(); + public AbortingScheduler() { } - private void handleException(Thread thread, Throwable throwable) { - logger.error("Uncaught exception during Client IO in thread {}", thread.getName(), throwable); - } public synchronized Scheduler get() { return Schedulers.from(getExecutorService(), @@ -40,14 +29,14 @@ public class AbortingScheduler { public synchronized void abort() { if (null != executorService) { executorService.shutdownNow(); - executorService = Executors.newFixedThreadPool(16, threadFactory); + executorService = Executors.newVirtualThreadPerTaskExecutor(); } } @Nonnull private synchronized ExecutorService getExecutorService() { if (null == executorService) { - executorService = Executors.newFixedThreadPool(16, threadFactory); + executorService = Executors.newVirtualThreadPerTaskExecutor(); } return executorService; } diff --git a/code/common/service-client/src/main/java/nu/marginalia/client/AbstractClient.java b/code/common/service-client/src/main/java/nu/marginalia/client/AbstractClient.java index 92e1ff0c..697671f0 100644 --- a/code/common/service-client/src/main/java/nu/marginalia/client/AbstractClient.java +++ b/code/common/service-client/src/main/java/nu/marginalia/client/AbstractClient.java @@ -1,6 +1,7 @@ package nu.marginalia.client; import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; import com.google.protobuf.GeneratedMessageV3; import io.reactivex.rxjava3.core.Observable; import io.reactivex.rxjava3.core.ObservableSource; @@ -20,6 +21,7 @@ import org.slf4j.LoggerFactory; import spark.utils.IOUtils; import java.io.OutputStream; +import java.lang.reflect.Type; import java.net.ConnectException; import java.util.concurrent.TimeUnit; import java.util.function.Supplier; @@ -233,6 +235,22 @@ public abstract class AbstractClient implements AutoCloseable { .doFinally(() -> ThreadContext.remove("outbound-request")); } + protected synchronized Observable get(Context ctx, int node, String endpoint, TypeToken type) { + ensureAlive(node); + + var req = ctx.paint(new Request.Builder()).url(serviceRoutes.get(node) + endpoint).get().build(); + + return Observable.just(client.newCall(req)) + .subscribeOn(scheduler().get()) + .map(this::logInbound) + .map(Call::execute) + .map(this::logOutbound) + .map(rsp -> validateResponseStatus(rsp, req, 200)) + .map(rsp -> getEntity(rsp, type)) + .retryWhen(this::retryHandler) + .timeout(timeout, TimeUnit.SECONDS) + .doFinally(() -> ThreadContext.remove("outbound-request")); + } protected synchronized Observable get(Context ctx, int node, String endpoint, OutputStream outputStream) { ensureAlive(node); @@ -388,6 +406,15 @@ public abstract class AbstractClient implements AutoCloseable { } } @SneakyThrows + private T getEntity(Response response, TypeToken clazz) { + try (response) { + return gson.fromJson(response.body().charStream(), clazz); + } + catch (Exception ex) { + throw ex; + } + } + @SneakyThrows private String getText(Response response) { try (response) { return response.body().string(); diff --git a/code/common/service-client/src/main/java/nu/marginalia/client/AbstractDynamicClient.java b/code/common/service-client/src/main/java/nu/marginalia/client/AbstractDynamicClient.java index 6e735010..5d2e3ef6 100644 --- a/code/common/service-client/src/main/java/nu/marginalia/client/AbstractDynamicClient.java +++ b/code/common/service-client/src/main/java/nu/marginalia/client/AbstractDynamicClient.java @@ -1,8 +1,6 @@ package nu.marginalia.client; import com.google.gson.Gson; -import nu.marginalia.client.route.RouteProvider; -import nu.marginalia.client.route.ServiceRoute; import nu.marginalia.service.descriptor.ServiceDescriptor; import javax.annotation.Nonnull; @@ -20,7 +18,7 @@ public class AbstractDynamicClient extends AbstractClient { ); this.service = service; - this.scheduler = new AbortingScheduler(name()); + this.scheduler = new AbortingScheduler(); } @Override diff --git a/code/common/service-client/src/test/java/nu/marginalia/client/AbstractClientTest.java b/code/common/service-client/src/test/java/nu/marginalia/client/AbstractClientTest.java index 44445204..2f02a8a7 100644 --- a/code/common/service-client/src/test/java/nu/marginalia/client/AbstractClientTest.java +++ b/code/common/service-client/src/test/java/nu/marginalia/client/AbstractClientTest.java @@ -42,7 +42,7 @@ public class AbstractClientTest { client = new AbstractClient(new RouteProvider(new ServiceDescriptor(ServiceId.Api, "localhost")), 1, Gson::new) { @Override public AbortingScheduler scheduler() { - return new AbortingScheduler(name()); + return new AbortingScheduler(); } @Override diff --git a/code/features-crawl/crawl-blocklist/build.gradle b/code/features-crawl/crawl-blocklist/build.gradle index c131e97b..8288aa0c 100644 --- a/code/features-crawl/crawl-blocklist/build.gradle +++ b/code/features-crawl/crawl-blocklist/build.gradle @@ -15,6 +15,7 @@ dependencies { implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:libraries:guarded-regex') + implementation project(':code:libraries:geo-ip') implementation libs.notnull diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java index f91ab135..79ca6847 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java @@ -1,72 +1,31 @@ package nu.marginalia.ip_blocklist; +import com.google.inject.Inject; import com.google.inject.Singleton; -import com.opencsv.CSVReader; -import com.opencsv.exceptions.CsvValidationException; -import lombok.AllArgsConstructor; -import nu.marginalia.WmsaHome; +import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.FileReader; -import java.io.IOException; -import java.net.InetAddress; import java.util.Set; -import java.util.TreeMap; @Singleton public class GeoIpBlocklist { - private final TreeMap ranges = new TreeMap<>(); + /** These countries are extremely overrepresented among the problematic and spammy domains, + * and blocking them is by far the most effective spam mitigation technique. Sucks we throw + * babies out with the bathwater, but it's undeniably effective. + */ private final Set blacklist = Set.of("CN", "HK"); private final Set graylist = Set.of("RU", "TW", "IN", "ZA", "SG", "UA"); private static final Logger logger = LoggerFactory.getLogger(GeoIpBlocklist.class); - @AllArgsConstructor - static class IpRange { - public final long from; - public final long to; - public final String country; - } + private final GeoIpDictionary ipDictionary; - public GeoIpBlocklist() throws IOException, CsvValidationException { - var resource = WmsaHome.getIPLocationDatabse(); - - try (var reader = new CSVReader(new FileReader(resource.toFile()))) { - for (;;) { - String[] vals = reader.readNext(); - if (vals == null) { - break; - } - if (!(blacklist.contains(vals[2]) || graylist.contains(vals[2]))) { - continue; - } - var range = new GeoIpBlocklist.IpRange(Long.parseLong(vals[0]), - Long.parseLong(vals[1]), - vals[2]); - ranges.put(range.from, range); - } - } - - logger.info("Loaded {} IP ranges", ranges.size()); - } - - public String getCountry(InetAddress address) { - byte[] bytes = address.getAddress(); - long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF); - - Long key = ranges.floorKey(ival); - if (null == key) { - return "-"; - } - - var range = ranges.get(key); - if (ival >= key && ival < range.to) { - return range.country; - } - - return "-"; + @Inject + public GeoIpBlocklist(GeoIpDictionary ipDictionary) { + this.ipDictionary = ipDictionary; + ipDictionary.waitReady(); } public boolean isAllowed(EdgeDomain domain) { @@ -84,7 +43,7 @@ public class GeoIpBlocklist { public String getCountry(EdgeDomain domain) { try { - return getCountry(InetAddressCache.getAddress(domain)); + return ipDictionary.getCountry(InetAddressCache.getAddress(domain)); } catch (Throwable ex) { logger.debug("Failed to resolve {}", domain); diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java index 728a1f65..ba9a7948 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java @@ -11,7 +11,7 @@ import java.util.concurrent.TimeUnit; // We don't want to torture the DNS by resolving the same links over and over and over again public class InetAddressCache { - private static final Cache cache = CacheBuilder.newBuilder().maximumSize(10_000_000).expireAfterAccess(1, TimeUnit.HOURS).build(); + private static final Cache cache = CacheBuilder.newBuilder().maximumSize(1_000_000).expireAfterAccess(1, TimeUnit.HOURS).build(); public static InetAddress getAddress(EdgeDomain domain) throws Throwable { try { return cache.get(domain, ()-> InetAddress.getByName(domain.getAddress())); diff --git a/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java b/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java new file mode 100644 index 00000000..15dbc087 --- /dev/null +++ b/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java @@ -0,0 +1,139 @@ +package nu.marginalia.util; + +import lombok.SneakyThrows; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Iterator; +import java.util.NoSuchElementException; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Consumer; + +/** + * Abstraction for exposing a (typically) read-from-disk -> parallel processing -> sequential output + * workflow as an iterator, where the number of tasks is much larger than the number of cores + */ +public class ProcessingIterator implements Iterator { + private static final Logger logger = LoggerFactory.getLogger(ProcessingIterator.class); + + private final LinkedBlockingQueue queue; + private final AtomicBoolean isFinished = new AtomicBoolean(false); + private final ExecutorService executorService; + private final Semaphore sem; + + private T next = null; + + private final int parallelism; + + public ProcessingIterator(int queueSize, int parallelism, ProcessingJob task) { + this.parallelism = parallelism; + + queue = new LinkedBlockingQueue<>(queueSize); + executorService = Executors.newFixedThreadPool(parallelism); + sem = new Semaphore(parallelism); + + executorService.submit(() -> executeJob(task)); + } + + private void executeJob(ProcessingJob job) { + try { + job.run(this::executeTask); + } catch (Exception e) { + logger.warn("Exception while processing", e); + } finally { + isFinished.set(true); + } + } + + private void executeTask(Task task) { + try { + sem.acquire(); + } catch (InterruptedException e) { + return; + } + + try { + queue.put(task.get()); + } catch (Exception e) { + logger.warn("Exception while processing", e); + } finally { + sem.release(); + } + } + + /** Returns true if there are more documents to be processed. + * This method may block until we are certain this is true. + *

+ * This method must be invoked from the same thread that invokes next(), + * (or synchronize between the two) + */ + @Override + @SneakyThrows + public boolean hasNext() { + if (next != null) + return true; + + do { + next = queue.poll(1, TimeUnit.SECONDS); + if (next != null) { + return true; + } + } while (expectMore()); + + if (!executorService.isShutdown()) { + executorService.shutdown(); + } + + return false; + } + + /** Heuristic for if we should expect more documents to be processed, + * _trust but verify_ since we don't run this in an exclusive section + * and may get a false positive. We never expect a false negative though. + */ + private boolean expectMore() { + return !isFinished.get() // we are still reading from the database + || !queue.isEmpty() // ... or we have documents in the queue + || sem.availablePermits() < parallelism; // ... or we are still processing documents + } + + /** Returns the next document to be processed. + * This method may block until we are certain there is a document to be processed. + *

+ * This method must be invoked from the same thread that invokes hasNext(), + * (or synchronize between the two) + *

+ * If this is run after hasNext() returns false, a NoSuchElementException is thrown. + */ + @SneakyThrows + @Override + public T next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + + try { + return next; + } + finally { + next = null; + } + } + + /** + * A job that produces a sequence of processing tasks that are to be + * performed in parallel + */ + public interface ProcessingJob { + void run(Consumer> output) throws Exception; + } + + /** + * A single task that produces a result to be iterable via the Iterator interface + * (along with other tasks' outputs) + */ + public interface Task { + T get() throws Exception; + } +} diff --git a/code/libraries/geo-ip/build.gradle b/code/libraries/geo-ip/build.gradle new file mode 100644 index 00000000..b0180ef8 --- /dev/null +++ b/code/libraries/geo-ip/build.gradle @@ -0,0 +1,24 @@ +plugins { + id 'java' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(21)) + } +} + +dependencies { + implementation project(':code:common:config') + + implementation libs.bundles.slf4j + implementation libs.opencsv + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} diff --git a/code/libraries/geo-ip/readme.md b/code/libraries/geo-ip/readme.md new file mode 100644 index 00000000..2a81b04b --- /dev/null +++ b/code/libraries/geo-ip/readme.md @@ -0,0 +1,6 @@ +This micro library handles the GeoIP lookups, mappings from IP addresses +to country codes. + +It uses the free ip2location lite database, which is +available from [https://lite.ip2location.com/database/ip-country](https://lite.ip2location.com/database/ip-country) +under a CC-BY-SA 4.0 license. \ No newline at end of file diff --git a/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java new file mode 100644 index 00000000..13b982f5 --- /dev/null +++ b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java @@ -0,0 +1,90 @@ +package nu.marginalia.geoip; + +import com.opencsv.CSVReader; +import nu.marginalia.WmsaHome; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.FileReader; +import java.net.InetAddress; +import java.util.TreeMap; + +public class GeoIpDictionary { + private volatile TreeMap ranges = null; + private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class); + + record IpRange(long from, long to, String country) {} + + public GeoIpDictionary() { + Thread.ofPlatform().start(() -> { + try (var reader = new CSVReader(new FileReader(WmsaHome.getIPLocationDatabse().toFile()))) { + var dict = new TreeMap(); + + for (;;) { + String[] vals = reader.readNext(); + if (vals == null) { + break; + } + var range = new IpRange(Long.parseLong(vals[0]), + Long.parseLong(vals[1]), + vals[2]); + dict.put(range.from, range); + } + ranges = dict; + logger.info("Loaded {} IP ranges", ranges.size()); + } catch (Exception e) { + ranges = new TreeMap<>(); + throw new RuntimeException(e); + } + finally { + this.notifyAll(); + } + }); + } + + public boolean isReady() { + return null != ranges; + } + + public boolean waitReady() { + while (null == ranges) { + try { + synchronized (this) { + this.wait(1000); + } + } catch (InterruptedException e) { + return false; + } + } + return true; + } + + public String getCountry(String ip) { + try { + return getCountry(InetAddress.getByName(ip)); + } catch (Exception e) { + return ""; + } + } + + public String getCountry(InetAddress address) { + if (null == ranges) { // not loaded yet or failed to load + return ""; + } + + byte[] bytes = address.getAddress(); + long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF); + + Long key = ranges.floorKey(ival); + if (null == key) { + return ""; + } + + var range = ranges.get(key); + if (ival >= key && ival < range.to) { + return range.country; + } + + return ""; + } +} diff --git a/code/libraries/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/libraries/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index 8b4e9ac5..732aa70c 100644 --- a/code/libraries/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/libraries/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -496,17 +496,21 @@ public class MqPersistence { return gson; } - /** Returns the last message sent to this inbox with a state of 'OK' */ - public Optional getHeadMessage(String inboxName) { + /** Returns the last message sent to this inbox with a state of 'OK' + * with an id greater than or equal to fromMsgId + */ + public Optional getHeadMessage(String inboxName, long fromMsgId) { try (var conn = dataSource.getConnection(); var query = conn.prepareStatement(""" SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE, SENDER_INBOX FROM MESSAGE_QUEUE - WHERE RECIPIENT_INBOX = ? AND STATE='OK' + WHERE RECIPIENT_INBOX = ? AND STATE='OK' AND ID >= ? ORDER BY ID DESC LIMIT 1 """)) { query.setString(1, inboxName); + query.setLong(2, fromMsgId); + var rs = query.executeQuery(); if (rs.next()) { long msgId = rs.getLong(1); diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 94d13235..143c775b 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -4,6 +4,7 @@ import lombok.AllArgsConstructor; import lombok.Builder; import lombok.ToString; import nu.marginalia.bigstring.BigString; +import nu.marginalia.model.EdgeUrl; @Builder @AllArgsConstructor @@ -35,4 +36,16 @@ public class CrawledDocument implements SerializableCrawlData { return SERIAL_IDENTIFIER; } + @Override + public String getDomain() { + if (url == null) + return null; + + return EdgeUrl + .parse(url) + .map(EdgeUrl::getDomain) + .map(d -> d.domain) + .orElse(null); + } + } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java index c9804d54..48b3f65d 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java @@ -2,4 +2,5 @@ package nu.marginalia.crawling.model; public interface SerializableCrawlData { String getSerialIdentifier(); + String getDomain(); } diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java index a31b199d..a0714557 100644 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java @@ -3,6 +3,7 @@ package nu.marginalia.io.processed; import blue.strategic.parquet.HydratorSupplier; import blue.strategic.parquet.ParquetReader; import nu.marginalia.model.processed.DomainRecord; +import nu.marginalia.model.processed.DomainWithIp; import org.jetbrains.annotations.NotNull; import java.io.IOException; @@ -19,10 +20,10 @@ public class DomainRecordParquetFileReader { } @NotNull - public static List getDomainNames(Path path) throws IOException { + public static List getBasicDomainInformation(Path path) throws IOException { return ParquetReader.streamContent(path.toFile(), HydratorSupplier.constantly(DomainRecord.newDomainNameHydrator()), - List.of("domain")) + List.of("domain", "ip")) .toList(); } diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java index e3a0c9f9..b696829f 100644 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java @@ -8,7 +8,6 @@ import org.apache.parquet.schema.*; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; -import java.sql.Array; import java.util.ArrayList; import java.util.List; @@ -48,8 +47,8 @@ public class DomainRecord { return DomainRecord::dehydrate; } - public static Hydrator newDomainNameHydrator() { - return new DomainNameHydrator(); + public static Hydrator newDomainNameHydrator() { + return new DomainWithIpHydrator(); } @@ -124,23 +123,26 @@ class DomainHydrator implements Hydrator { } } -class DomainNameHydrator implements Hydrator { +class DomainWithIpHydrator implements Hydrator { @Override - public String start() { - return ""; + public DomainWithIp start() { + return new DomainWithIp(); } @Override - public String add(String target, String heading, Object value) { + public DomainWithIp add(DomainWithIp target, String heading, Object value) { if ("domain".equals(heading)) { - return (String) value; + target.domain = (String) value; + } + else if ("ip".equals(heading)) { + target.ip = (String) value; } return target; } @Override - public String finish(String target) { + public DomainWithIp finish(DomainWithIp target) { return target; } } \ No newline at end of file diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java new file mode 100644 index 00000000..3782b1b2 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java @@ -0,0 +1,15 @@ +package nu.marginalia.model.processed; + +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import lombok.ToString; + +@AllArgsConstructor +@NoArgsConstructor +@EqualsAndHashCode +@ToString +public class DomainWithIp { + public String domain; + public String ip; +} diff --git a/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java b/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java index 7c73f13e..b1867100 100644 --- a/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java +++ b/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java @@ -1,6 +1,7 @@ package nu.marginalia.io.processed; import nu.marginalia.model.processed.DomainRecord; +import nu.marginalia.model.processed.DomainWithIp; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -53,8 +54,11 @@ class DomainRecordParquetFileReaderTest { writer.write(second); } - var domainNames = DomainRecordParquetFileReader.getDomainNames(parquetFile); - assertEquals(List.of("www.marginalia.nu", "memex.marginalia.nu"), domainNames); + var domainInfo = DomainRecordParquetFileReader.getBasicDomainInformation(parquetFile); + assertEquals(List.of( + new DomainWithIp("www.marginalia.nu", "127.0.0.1"), + new DomainWithIp("memex.marginalia.nu", "127.0.0.1")), + domainInfo); var items = DomainRecordParquetFileReader .stream(parquetFile) diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 4a3f2290..556f8015 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -41,6 +41,7 @@ dependencies { implementation project(':code:libraries:guarded-regex') implementation project(':code:libraries:easy-lsh') + implementation project(':code:libraries:geo-ip') implementation project(':code:libraries:big-string') implementation project(':code:libraries:language-processing') diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 50f29fb1..3bada914 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -138,10 +138,16 @@ public class ConverterMain { for (var domain : plan.crawlDataIterable(id -> !batchingWorkLog.isItemProcessed(id))) { pool.submit(() -> { - ProcessedDomain processed = processor.process(domain); - converterWriter.accept(processed); - - heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); + try { + ProcessedDomain processed = processor.process(domain); + converterWriter.accept(processed); + } + catch (Exception ex) { + logger.info("Error in processing", ex); + } + finally { + heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); + } }); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index f5effebd..fc824906 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -10,6 +10,7 @@ import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.logic.links.LinkGraph; import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.*; +import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeDomain; @@ -22,6 +23,7 @@ import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.*; +import java.util.regex.Pattern; public class DomainProcessor { private final DocumentProcessor documentProcessor; @@ -29,6 +31,7 @@ public class DomainProcessor { private final AnchorTagsSource anchorTagsSource; private final AnchorTextKeywords anchorTextKeywords; private final LshDocumentDeduplicator documentDeduplicator; + private final GeoIpDictionary geoIpDictionary; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -37,13 +40,16 @@ public class DomainProcessor { SiteWords siteWords, AnchorTagsSourceFactory anchorTagsSourceFactory, AnchorTextKeywords anchorTextKeywords, - LshDocumentDeduplicator documentDeduplicator) throws SQLException + LshDocumentDeduplicator documentDeduplicator, GeoIpDictionary geoIpDictionary) throws SQLException { this.documentProcessor = documentProcessor; this.siteWords = siteWords; this.anchorTextKeywords = anchorTextKeywords; this.documentDeduplicator = documentDeduplicator; this.anchorTagsSource = anchorTagsSourceFactory.create(); + this.geoIpDictionary = geoIpDictionary; + + geoIpDictionary.waitReady(); } @SneakyThrows @@ -54,11 +60,21 @@ public class DomainProcessor { boolean cookies = false; String ip = ""; - DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain); + DomainLinks externalDomainLinks = null; while (dataStream.hasNext()) { var data = dataStream.next(); + // Do a lazy load of the external domain links since we don't know the domain + // until we see the first document + if (externalDomainLinks == null) { + var domain = data.getDomain(); + + if (domain != null) { + externalDomainLinks = anchorTagsSource.getAnchorTags(domain); + } + } + if (data instanceof CrawledDomain crawledDomain) { ret.domain = new EdgeDomain(crawledDomain.domain); ret.ip = crawledDomain.ip; @@ -76,8 +92,15 @@ public class DomainProcessor { try { if (doc.url == null) continue; + fixBadCanonicalTag(doc); + // This case should never be reachable, as we should have initiated + // the externalDomainLinks variable above if we made it past the + // doc.url == null check; but we'll leave it here just in case + // to make debugging easier if we break this. + assert externalDomainLinks != null : "externalDomainLinks has not been initialized"; + docs.add(documentProcessor.process(doc, externalDomainLinks)); } catch (Exception ex) { @@ -89,11 +112,22 @@ public class DomainProcessor { // Add late keywords and features from domain-level information List terms = new ArrayList<>(); + terms.add("ip:"+ip); + + String ipCountryCode = geoIpDictionary.getCountry(ip).toLowerCase(); + if (!ipCountryCode.isBlank()) { + terms.add("ip:"+ipCountryCode); + } + if (cookies) { terms.add(HtmlFeature.COOKIES.getKeyword()); } + if (isAcademicDomain(ret.domain)) { + terms.add("special:academia"); + } + for (var document : ret.documents) { if (document.details == null) continue; @@ -114,6 +148,19 @@ public class DomainProcessor { return ret; } + + private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$"); + private boolean isAcademicDomain(EdgeDomain domain) { + + if (domain.domain.endsWith(".edu")) + return true; + + if (academicPattern.matcher(domain.domain).matches()) + return true; + + return false; + } + private void fixBadCanonicalTag(CrawledDocument doc) { // Some sites have a canonical tag that points to a different domain, // but our loader can not support this, so we point these back to the diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index 48ab45c9..808d4224 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload; import com.google.gson.Gson; import com.google.inject.Inject; +import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory; import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader; @@ -21,6 +22,7 @@ public class SideloadSourceFactory { private final SideloaderProcessing sideloaderProcessing; private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider; private final DocumentKeywordExtractor documentKeywordExtractor; + private final AnchorTextKeywords anchorTextKeywords; private final AnchorTagsSourceFactory anchorTagsSourceFactory; private final DirtreeSideloaderFactory dirtreeSideloaderFactory; private final WarcSideloadFactory warcSideloadFactory; @@ -29,7 +31,7 @@ public class SideloadSourceFactory { public SideloadSourceFactory(Gson gson, SideloaderProcessing sideloaderProcessing, ThreadLocalSentenceExtractorProvider sentenceExtractorProvider, - DocumentKeywordExtractor documentKeywordExtractor, + DocumentKeywordExtractor documentKeywordExtractor, AnchorTextKeywords anchorTextKeywords, AnchorTagsSourceFactory anchorTagsSourceFactory, DirtreeSideloaderFactory dirtreeSideloaderFactory, WarcSideloadFactory warcSideloadFactory) { @@ -37,13 +39,14 @@ public class SideloadSourceFactory { this.sideloaderProcessing = sideloaderProcessing; this.sentenceExtractorProvider = sentenceExtractorProvider; this.documentKeywordExtractor = documentKeywordExtractor; + this.anchorTextKeywords = anchorTextKeywords; this.anchorTagsSourceFactory = anchorTagsSourceFactory; this.dirtreeSideloaderFactory = dirtreeSideloaderFactory; this.warcSideloadFactory = warcSideloadFactory; } public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException { - return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, sideloaderProcessing); + return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing); } public Collection sideloadDirtree(Path pathToYamlFile) throws IOException { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index aab62ef9..204aa6a8 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.sideload.encyclopedia; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; import lombok.SneakyThrows; +import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.model.DisqualifiedException; @@ -14,6 +15,7 @@ import nu.marginalia.converting.sideload.SideloaderProcessing; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.util.ProcessingIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -27,15 +29,10 @@ import java.nio.file.Path; import java.sql.*; import java.util.Iterator; import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.Semaphore; -import java.util.concurrent.atomic.AtomicBoolean; /** This is an experimental sideloader for encyclopedia.marginalia.nu's database; * (which serves as a way of loading wikipedia's zim files without binding to GPL2'd code) - * + *

* See https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu for extracting the data */ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable { @@ -43,6 +40,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC private final Connection connection; private final EdgeUrl baseUrl; private final Gson gson; + private final AnchorTextKeywords anchorTextKeywords; private final SideloaderProcessing sideloaderProcessing; private final AnchorTagsSourceFactory anchorTagsSourceFactory; private static final Logger logger = LoggerFactory.getLogger(EncyclopediaMarginaliaNuSideloader.class); @@ -51,9 +49,11 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC String baseUrl, Gson gson, AnchorTagsSourceFactory anchorTagsSourceFactory, + AnchorTextKeywords anchorTextKeywords, SideloaderProcessing sideloaderProcessing) throws SQLException { this.baseUrl = EdgeUrl.parse(baseUrl).orElseThrow(AssertionError::new); this.gson = gson; + this.anchorTextKeywords = anchorTextKeywords; this.sideloaderProcessing = sideloaderProcessing; String sqliteDbString = "jdbc:sqlite:" + pathToDbFile.toString(); @@ -76,62 +76,24 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC @SneakyThrows @Override public Iterator getDocumentsStream() { - LinkedBlockingQueue docs = new LinkedBlockingQueue<>(32); - AtomicBoolean isFinished = new AtomicBoolean(false); + return new ProcessingIterator<>(24, 16, (taskConsumer) -> { + DomainLinks domainLinks = getDomainLinks(); - ExecutorService executorService = Executors.newFixedThreadPool(16); - Semaphore sem = new Semaphore(16); + var stmt = connection.prepareStatement(""" + SELECT url,title,html FROM articles + """); + stmt.setFetchSize(100); - DomainLinks domainLinks = getDomainLinks(); + var rs = stmt.executeQuery(); - executorService.submit(() -> { - try { - var stmt = connection.prepareStatement(""" - SELECT url,title,html FROM articles - """); - stmt.setFetchSize(100); + while (rs.next()) { + var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class); + String title = rs.getString("title"); + String url = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8); - var rs = stmt.executeQuery(); - while (rs.next()) { - var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class); - String title = rs.getString("title"); - String url = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8); - - sem.acquire(); - - executorService.submit(() -> { - try { - docs.add(convertDocument(articleParts.parts, title, url, domainLinks)); - } catch (URISyntaxException | DisqualifiedException e) { - e.printStackTrace(); - } finally { - sem.release(); - } - }); - } - - stmt.close(); - } - catch (Exception e) { - e.printStackTrace(); - } - finally { - isFinished.set(true); + taskConsumer.accept(() -> convertDocument(articleParts.parts, title, url, domainLinks)); } }); - - return new Iterator<>() { - @Override - public boolean hasNext() { - return !isFinished.get() || !docs.isEmpty() || sem.availablePermits() < 16; - } - - @SneakyThrows - @Override - public ProcessedDocument next() { - return docs.take(); - } - }; } private DomainLinks getDomainLinks() { @@ -142,30 +104,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC logger.error("Failed to create anchor tags source", ex); return new DomainLinks(); } - - } - - ProcessedDocument processJust(String url) throws SQLException, IOException, URISyntaxException, DisqualifiedException { - var stmt = connection.prepareStatement(""" - SELECT url,title,html - FROM articles - WHERE url=? - """); - stmt.setFetchSize(100); - stmt.setString(1, url); - - var rs = stmt.executeQuery(); - if (rs.next()) { - var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class); - String title = rs.getString("title"); - - return convertDocument(articleParts.parts, - title, - URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8), - new DomainLinks() // FIXME (2023-11-06): Sideloaded dirtrees don't have access to anchor tag data. - ); - } - return null; } private ProcessedDocument convertDocument(List parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException { @@ -180,13 +118,22 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC } fullHtml.append(""); - return sideloaderProcessing + var doc = sideloaderProcessing .processDocument(fullUrl, fullHtml.toString(), List.of("encyclopedia", "wiki"), domainLinks, GeneratorType.WIKI, 10_000_000); + + // Add anchor text keywords + if (doc.isProcessedFully()) { + doc.words.addAnchorTerms( + anchorTextKeywords.getAnchorTextKeywords(domainLinks, doc.url) + ); + } + + return doc; } private T fromCompressedJson(byte[] stream, Class type) throws IOException { diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 7150b1e0..5b5deddc 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -6,6 +6,7 @@ import lombok.SneakyThrows; import nu.marginalia.WmsaHome; import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; @@ -78,7 +79,7 @@ public class CrawlingThenConvertingIntegrationTest { List data = new ArrayList<>(); try (var recorder = new WarcRecorder()) { - new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch(); } CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get(); diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloaderTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloaderTest.java index 0b1b6904..fd4aa73a 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloaderTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloaderTest.java @@ -1,38 +1,12 @@ package nu.marginalia.converting.sideload.encyclopedia; -import com.google.inject.AbstractModule; -import com.google.inject.Guice; -import gnu.trove.list.array.TLongArrayList; -import nu.marginalia.atags.source.AnchorTagsSourceFactory; -import nu.marginalia.atags.model.DomainLinks; -import nu.marginalia.converting.ConverterModule; -import nu.marginalia.converting.model.DisqualifiedException; -import nu.marginalia.converting.processor.ConverterDomainTypes; -import nu.marginalia.converting.sideload.SideloaderProcessing; -import nu.marginalia.io.processed.DocumentRecordParquetFileReader; -import nu.marginalia.io.processed.DocumentRecordParquetFileWriter; -import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.model.processed.DocumentRecord; import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.mockito.Mockito; import java.io.IOException; -import java.net.URISyntaxException; import java.nio.file.Files; import java.nio.file.Path; -import java.sql.SQLException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import static org.junit.jupiter.api.Assertions.*; -import static org.mockito.Mockito.when; class EncyclopediaMarginaliaNuSideloaderTest { Path tempFile; @@ -62,93 +36,5 @@ class EncyclopediaMarginaliaNuSideloaderTest { System.out.printf("%64s\n", Long.toBinaryString(Long.reverseBytes(0x1000000000000000L))); System.out.printf("%64s\n", Long.toBinaryString(0x10L)); } - @Test - public void debugSpecificArticle() throws SQLException, IOException, URISyntaxException, DisqualifiedException { - Path pathToDbFile = Path.of("/home/vlofgren/Code/MarginaliaSearch/run/samples/articles.db"); - if (!Files.exists(pathToDbFile)) { - // not really practical to ship a 40 Gb sqlite files on github - // be @vlofgren to run this test - return; - } - var domainTypesMock = Mockito.mock(ConverterDomainTypes.class); - when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false); - var processing = Guice.createInjector(new ConverterModule(), - new AbstractModule() { - public void configure() { - bind(ConverterDomainTypes.class).toInstance(domainTypesMock); - } - } - ) - .getInstance(SideloaderProcessing.class); - var atagsFactory = Mockito.mock(AnchorTagsSourceFactory.class); - when(atagsFactory.create(Mockito.any())).thenReturn(domain -> new DomainLinks()); - - var sideloader = new EncyclopediaMarginaliaNuSideloader( - pathToDbFile, - "https://en.wikipedia.org/wiki/", - GsonFactory.get(), - atagsFactory, - processing - ); - - var document = sideloader.processJust("Don't_Tell_Me_(Madonna_song)"); - - System.out.println(document); - - var keywordsBuilt = document.words.build(); - - var ptr = keywordsBuilt.newPointer(); - - Map dirtyAndBlues = new HashMap<>(); - - while (ptr.advancePointer()) { - String word = ptr.getKeyword(); - - System.out.println(word + ": " + Long.toHexString(Long.reverseBytes(ptr.getMetadata()))); - - if (Set.of("dirty", "blues").contains(word)) { - WordMetadata meta = new WordMetadata(ptr.getMetadata()); - - Assertions.assertNull( - dirtyAndBlues.put(word, meta) - ); - } - } - - Assertions.assertTrue(dirtyAndBlues.containsKey("dirty")); - Assertions.assertTrue(dirtyAndBlues.containsKey("blues")); - Assertions.assertNotEquals( - dirtyAndBlues.get("dirty"), - dirtyAndBlues.get("blues") - ); - - try (var dw = new DocumentRecordParquetFileWriter(tempFile)) { - dw.write(new DocumentRecord( - "encyclopedia.marginalia.nu", - document.url.toString(), - 0, - document.state.toString(), - document.stateReason, - document.details.title, - document.details.description, - HtmlFeature.encode(document.details.features), - document.details.standard.name(), - document.details.length, - document.details.hashCode, - (float) document.details.quality, - document.details.metadata.encode(), - document.details.pubYear, - List.of(keywordsBuilt.keywords), - new TLongArrayList(keywordsBuilt.metadata) - )); - } - - var record = DocumentRecordParquetFileReader.streamKeywordsProjection(tempFile).findFirst().get(); - String[] words = record.words.toArray(String[]::new); - long[] meta = record.metas.toArray(); - - assertArrayEquals(keywordsBuilt.keywords, words); - assertArrayEquals(keywordsBuilt.metadata, meta); - } } \ No newline at end of file diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/util/ProcessingIteratorTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/util/ProcessingIteratorTest.java new file mode 100644 index 00000000..d20b7ddf --- /dev/null +++ b/code/processes/converting-process/src/test/java/nu/marginalia/util/ProcessingIteratorTest.java @@ -0,0 +1,38 @@ +package nu.marginalia.util; + +import org.junit.jupiter.api.Test; + +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class ProcessingIteratorTest { + + @Test + public void test() { + Set output = new HashSet<>(); + var iter = new ProcessingIterator(2, 2, q -> { + for (int i = 0; i < 10_000; i++) { + int j = i; + q.accept(() -> task(j)); + } + }); + while (iter.hasNext()) { + output.add(iter.next()); + } + + assertEquals(10_000, output.size()); + + for (int i = 0; i < 10_000; i++) { + assertTrue(output.contains(i)); + } + } + + int task(int n) throws InterruptedException { + TimeUnit.NANOSECONDS.sleep(10); + return n; + } +} \ No newline at end of file diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 0a4fe32e..b3a9d26a 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome; import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.crawl.retreival.CrawlDataReference; +import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.spec.CrawlSpecProvider; @@ -51,6 +52,7 @@ public class CrawlerMain { private final ProcessHeartbeatImpl heartbeat; private final MessageQueueFactory messageQueueFactory; + private final DomainProber domainProber; private final FileStorageService fileStorageService; private final DbCrawlSpecProvider dbCrawlSpecProvider; private final AnchorTagsSourceFactory anchorTagsSourceFactory; @@ -70,7 +72,7 @@ public class CrawlerMain { @Inject public CrawlerMain(UserAgent userAgent, ProcessHeartbeatImpl heartbeat, - MessageQueueFactory messageQueueFactory, + MessageQueueFactory messageQueueFactory, DomainProber domainProber, FileStorageService fileStorageService, ProcessConfiguration processConfiguration, DbCrawlSpecProvider dbCrawlSpecProvider, @@ -78,6 +80,7 @@ public class CrawlerMain { Gson gson) { this.heartbeat = heartbeat; this.messageQueueFactory = messageQueueFactory; + this.domainProber = domainProber; this.fileStorageService = fileStorageService; this.dbCrawlSpecProvider = dbCrawlSpecProvider; this.anchorTagsSourceFactory = anchorTagsSourceFactory; @@ -211,14 +214,13 @@ public class CrawlerMain { try (CrawledDomainWriter writer = new CrawledDomainWriter(outputDir, domain, id); var warcRecorder = new WarcRecorder(); // write to a temp file for now - var retreiver = new CrawlerRetreiver(fetcher, specification, warcRecorder, writer::accept); + var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder, writer::accept); CrawlDataReference reference = getReference()) { Thread.currentThread().setName("crawling:" + domain); var domainLinks = anchorTagsSource.getAnchorTags(domain); - int size = retreiver.fetch(domainLinks, reference); workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 22fcaa15..bb4991b9 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -45,7 +45,7 @@ public class CrawlerRetreiver implements AutoCloseable { private static final UrlBlocklist urlBlocklist = new UrlBlocklist(); private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector(); - private static final DomainProber domainProber = new DomainProber(); + private final DomainProber domainProber; private final SitemapRetriever sitemapRetriever; private final DomainCrawlFrontier crawlFrontier; private final WarcRecorder warcRecorder; @@ -59,12 +59,14 @@ public class CrawlerRetreiver implements AutoCloseable { private static final String documentWasSameTag = "SAME-BY-COMPARISON"; public CrawlerRetreiver(HttpFetcher fetcher, + DomainProber domainProber, CrawlSpecRecord specs, WarcRecorder warcRecorder, Consumer writer) { this.warcRecorder = warcRecorder; this.fetcher = fetcher; + this.domainProber = domainProber; domain = specs.domain; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java index 67f006d4..fcc005a8 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java @@ -1,5 +1,7 @@ package nu.marginalia.crawl.retreival; +import com.google.inject.Inject; +import com.google.inject.Singleton; import nu.marginalia.crawl.retreival.fetcher.FetchResultState; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawling.model.CrawlerDomainStatus; @@ -11,17 +13,21 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; +import java.util.function.Predicate; +@Singleton public class DomainProber { private final Logger logger = LoggerFactory.getLogger(DomainProber.class); - private static IpBlockList ipBlockList; + private final Predicate domainBlacklist; - static { - try { - ipBlockList = new IpBlockList(new GeoIpBlocklist()); - } catch (Exception e) { - throw new RuntimeException(e); - } + @Inject + public DomainProber(IpBlockList ipBlockList) { + this.domainBlacklist = ipBlockList::isAllowed; + } + + /** For testing */ + public DomainProber(Predicate domainBlacklist) { + this.domainBlacklist = domainBlacklist; } /** To detect problems early we do a probing request to the domain before we start crawling it properly. @@ -37,7 +43,7 @@ public class DomainProber { return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs"); } - if (!ipBlockList.isAllowed(firstUrlInQueue.domain)) + if (!domainBlacklist.test(firstUrlInQueue.domain)) return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed"); var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null)); @@ -62,7 +68,7 @@ public class DomainProber { /** This domain redirects to another domain */ public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {} - /** If the retreivala of the probed url was successful, return the url as it was fetched + /** If the retrieval of the probed url was successful, return the url as it was fetched * (which may be different from the url we probed, if we attempted another URL schema). * * @param probedUrl The url we successfully probed diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 4d985b8b..8ff9dd12 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -90,7 +90,10 @@ public class HttpFetcherImpl implements HttpFetcher { } @Inject - public HttpFetcherImpl(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) { + public HttpFetcherImpl(@Named("user-agent") String userAgent, + Dispatcher dispatcher, + ConnectionPool connectionPool) + { this.client = createClient(dispatcher, connectionPool); this.userAgent = userAgent; this.contentTypeProber = new ContentTypeProber(userAgent, client); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index 9a974713..e5264301 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -3,6 +3,7 @@ package nu.marginalia.crawling.retreival; import crawlercommons.robots.SimpleRobotRules; import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.*; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.model.CrawledDocument; @@ -65,7 +66,7 @@ public class CrawlerMockFetcherTest { void crawl(CrawlSpecRecord spec, Consumer consumer) throws IOException { try (var recorder = new WarcRecorder()) { - new CrawlerRetreiver(fetcherMock, spec, recorder, consumer) + new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder, consumer) .fetch(); } } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 3b58d50f..59bf99f6 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; @@ -59,7 +60,7 @@ class CrawlerRetreiverTest { List data = new ArrayList<>(); try (var recorder = new WarcRecorder(tempFile)) { - new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch(); } catch (IOException ex) { Assertions.fail(ex); } @@ -103,7 +104,7 @@ class CrawlerRetreiverTest { List data = new ArrayList<>(); try (var recorder = new WarcRecorder()) { - new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch(); } catch (IOException ex) { Assertions.fail(ex); @@ -137,7 +138,7 @@ class CrawlerRetreiverTest { List data = new ArrayList<>(); try (var recorder = new WarcRecorder()) { - new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch(); } catch (IOException ex) { Assertions.fail(ex); @@ -178,7 +179,7 @@ class CrawlerRetreiverTest { Map, List> data = new HashMap<>(); try (var recorder = new WarcRecorder()) { - new CrawlerRetreiver(httpFetcher, specs, recorder, d -> { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, d -> { data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d); if (d instanceof CrawledDocument doc) { System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); @@ -202,7 +203,7 @@ class CrawlerRetreiverTest { CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); try (var recorder = new WarcRecorder()) { - new CrawlerRetreiver(httpFetcher, specs, recorder, d -> { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, d -> { if (d instanceof CrawledDocument doc) { System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java index 3ce30d96..911c976d 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java @@ -9,6 +9,7 @@ import nu.marginalia.io.processed.DomainRecordParquetFileReader; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.processed.DomainRecord; +import nu.marginalia.model.processed.DomainWithIp; import nu.marginalia.process.control.ProcessHeartbeatImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -51,9 +52,9 @@ public class DomainLoaderService { ) { try (var inserter = new DomainInserter(conn, nodeId)) { - for (var domain : readSetDomainNames(inputData)) { - inserter.accept(new EdgeDomain(domain)); - domainNamesAll.add(domain); + for (var domainWithIp : readBasicDomainInformation(inputData)) { + inserter.accept(new EdgeDomain(domainWithIp.domain)); + domainNamesAll.add(domainWithIp.domain); } } try (var inserter = new DomainInserter(conn, -1)) { @@ -63,9 +64,9 @@ public class DomainLoaderService { } } - try (var updater = new DomainAffinityUpdater(conn, nodeId)) { - for (var domain : readSetDomainNames(inputData)) { - updater.accept(new EdgeDomain(domain)); + try (var updater = new DomainAffinityAndIpUpdater(conn, nodeId)) { + for (var domainWithIp : readBasicDomainInformation(inputData)) { + updater.accept(new EdgeDomain(domainWithIp.domain), domainWithIp.ip); } } @@ -84,15 +85,15 @@ public class DomainLoaderService { return ret; } - Collection readSetDomainNames(LoaderInputData inputData) throws IOException { - final Set domainNamesAll = new HashSet<>(100_000); + Collection readBasicDomainInformation(LoaderInputData inputData) throws IOException { + final Set domainsAll = new HashSet<>(100_000); var domainFiles = inputData.listDomainFiles(); for (var file : domainFiles) { - domainNamesAll.addAll(DomainRecordParquetFileReader.getDomainNames(file)); + domainsAll.addAll(DomainRecordParquetFileReader.getBasicDomainInformation(file)); } - return domainNamesAll; + return domainsAll; } Collection readReferencedDomainNames(LoaderInputData inputData) throws IOException { @@ -164,20 +165,25 @@ public class DomainLoaderService { statement.close(); } } - private static class DomainAffinityUpdater implements AutoCloseable { + private static class DomainAffinityAndIpUpdater implements AutoCloseable { private final PreparedStatement statement; private final int nodeAffinity; private int count = 0; - public DomainAffinityUpdater(Connection connection, int affinity) throws SQLException { + public DomainAffinityAndIpUpdater(Connection connection, int affinity) throws SQLException { this.nodeAffinity = affinity; - statement = connection.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY = ? WHERE DOMAIN_NAME=?"); + statement = connection.prepareStatement(""" + UPDATE EC_DOMAIN + SET NODE_AFFINITY = ?, IP = ? + WHERE DOMAIN_NAME=? + """); } - public void accept(EdgeDomain domain) throws SQLException { + public void accept(EdgeDomain domain, String ip) throws SQLException { statement.setInt(1, nodeAffinity); - statement.setString(2, domain.toString()); + statement.setString(2, ip); + statement.setString(3, domain.toString()); statement.addBatch(); if (++count > 1000) { diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java index e199580f..d235fe74 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java @@ -6,7 +6,6 @@ import nu.marginalia.ProcessConfiguration; import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter; import nu.marginalia.io.processed.DomainRecordParquetFileWriter; import nu.marginalia.io.processed.ProcessedDataFileNames; -import nu.marginalia.loader.DbTestUtil; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.model.processed.DomainLinkRecord; import nu.marginalia.model.processed.DomainRecord; @@ -21,10 +20,8 @@ import org.testcontainers.junit.jupiter.Testcontainers; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.sql.SQLException; import java.util.*; import java.util.stream.Collectors; -import java.util.stream.Stream; import static org.junit.jupiter.api.Assertions.*; @@ -99,7 +96,7 @@ class DomainLoaderServiceTest { // Verify Set expectedDomains1 = Sets.union(new HashSet<>(domains1), new HashSet<>(domains2)); - assertEquals(expectedDomains1, domainService.readSetDomainNames(new LoaderInputData(workDir, 2))); + assertEquals(expectedDomains1, domainService.readBasicDomainInformation(new LoaderInputData(workDir, 2)).stream().map(d -> d.domain).collect(Collectors.toSet())); Set expectedDomains2 = new HashSet<>(linkDomains); assertEquals(expectedDomains2, domainService.readReferencedDomainNames(new LoaderInputData(workDir, 2))); diff --git a/code/services-application/api-service/src/main/java/nu/marginalia/api/ApiSearchOperator.java b/code/services-application/api-service/src/main/java/nu/marginalia/api/ApiSearchOperator.java index 7dca777c..f60a1750 100644 --- a/code/services-application/api-service/src/main/java/nu/marginalia/api/ApiSearchOperator.java +++ b/code/services-application/api-service/src/main/java/nu/marginalia/api/ApiSearchOperator.java @@ -10,8 +10,6 @@ import nu.marginalia.index.client.model.query.SearchSetIdentifier; import nu.marginalia.index.client.model.results.DecoratedSearchResultItem; import nu.marginalia.index.client.model.results.SearchResultKeywordScore; import nu.marginalia.index.query.limit.QueryLimits; -import nu.marginalia.index.query.limit.SpecificationLimit; -import nu.marginalia.index.searchset.SearchSet; import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.query.client.QueryClient; import nu.marginalia.query.model.QueryParams; @@ -64,7 +62,7 @@ public class ApiSearchOperator { return switch (index) { case 0 -> SearchSetIdentifier.NONE; case 1 -> SearchSetIdentifier.SMALLWEB; - case 2 -> SearchSetIdentifier.RETRO; + case 2 -> SearchSetIdentifier.POPULAR; case 3 -> SearchSetIdentifier.NONE; case 5 -> SearchSetIdentifier.NONE; default -> SearchSetIdentifier.NONE; diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchService.java index 13f14e6e..1ed36a98 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchService.java @@ -31,7 +31,6 @@ public class SearchService extends Service { SearchFrontPageService frontPageService, SearchErrorPageService errorPageService, SearchAddToCrawlQueueService addToCrawlQueueService, - SearchFlagSiteService flagSiteService, SearchSiteInfoService siteInfoService, SearchQueryService searchQueryService ) { diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/DomainInformation.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/DomainInformation.java deleted file mode 100644 index 2491258d..00000000 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/DomainInformation.java +++ /dev/null @@ -1,28 +0,0 @@ -package nu.marginalia.search.model; - -import lombok.*; -import nu.marginalia.model.EdgeDomain; - -import java.util.List; - -@Getter @AllArgsConstructor @NoArgsConstructor @Builder -@ToString -public class DomainInformation { - EdgeDomain domain; - - boolean blacklisted; - int pagesKnown; - int pagesFetched; - int pagesIndexed; - int incomingLinks; - int outboundLinks; - int nodeAffinity; - double ranking; - - boolean suggestForCrawling; - boolean inCrawlQueue; - boolean unknownDomain; - - String state; - List linkingDomains; -} diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchFilters.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchFilters.java index 0ce79b31..3afdef7f 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchFilters.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchFilters.java @@ -5,7 +5,6 @@ import nu.marginalia.WebsiteUrl; import nu.marginalia.search.command.SearchAdtechParameter; import nu.marginalia.search.command.SearchJsParameter; import nu.marginalia.search.command.SearchParameters; -import org.apache.regexp.RE; import java.util.List; @@ -37,7 +36,7 @@ public class SearchFilters { filterGroups = List.of( List.of( new Filter("No Filter", SearchProfile.NO_FILTER, parameters), - new Filter("Popular", SearchProfile.DEFAULT, parameters), + new Filter("Popular", SearchProfile.POPULAR, parameters), new Filter("Small Web", SearchProfile.SMALLWEB, parameters), new Filter("Blogosphere", SearchProfile.BLOGOSPHERE, parameters), new Filter("Academia", SearchProfile.ACADEMIA, parameters) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java index ab61ca11..e0daa79c 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java @@ -8,19 +8,16 @@ import nu.marginalia.index.client.model.query.SearchSetIdentifier; import java.util.Objects; public enum SearchProfile { - DEFAULT("default", SearchSetIdentifier.RETRO), + POPULAR("default", SearchSetIdentifier.POPULAR), SMALLWEB("modern", SearchSetIdentifier.SMALLWEB), BLOGOSPHERE("blogosphere", SearchSetIdentifier.BLOGS), NO_FILTER("corpo", SearchSetIdentifier.NONE), - YOLO("yolo", SearchSetIdentifier.NONE), VINTAGE("vintage", SearchSetIdentifier.NONE), TILDE("tilde", SearchSetIdentifier.NONE), CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE), - ACADEMIA("academia", SearchSetIdentifier.ACADEMIA), + ACADEMIA("academia", SearchSetIdentifier.NONE), PLAIN_TEXT("plain-text", SearchSetIdentifier.NONE), - FOOD("food", SearchSetIdentifier.NONE), - CRAFTS("crafts", SearchSetIdentifier.NONE), - CLASSICS("classics", SearchSetIdentifier.NONE), + FOOD("food", SearchSetIdentifier.POPULAR), FORUM("forum", SearchSetIdentifier.NONE), WIKI("wiki", SearchSetIdentifier.NONE), DOCS("docs", SearchSetIdentifier.NONE), @@ -38,7 +35,7 @@ public enum SearchProfile { private final static SearchProfile[] values = values(); public static SearchProfile getSearchProfile(String param) { if (null == param) { - return NO_FILTER; + return POPULAR; } for (var profile : values) { @@ -47,12 +44,12 @@ public enum SearchProfile { } } - return NO_FILTER; + return POPULAR; } public void addTacitTerms(SearchSubquery subquery) { if (this == ACADEMIA) { - subquery.searchTermsPriority.add("tld:edu"); + subquery.searchTermsAdvice.add("special:academia"); } if (this == VINTAGE) { subquery.searchTermsPriority.add("format:html123"); @@ -75,9 +72,7 @@ public enum SearchProfile { } if (this == FOOD) { subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_FOOD.getKeyword()); - } - if (this == CRAFTS) { - subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_CRAFTS.getKeyword()); + subquery.searchTermsExclude.add("special:ads"); } } @@ -106,13 +101,5 @@ public enum SearchProfile { else return SpecificationLimit.none(); } - - - public String getNearDomain() { - if (this == CLASSICS) { - return "classics.mit.edu"; - } - return null; - } } diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java deleted file mode 100644 index c05cfec2..00000000 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java +++ /dev/null @@ -1,276 +0,0 @@ -package nu.marginalia.search.siteinfo; - -import com.zaxxer.hikari.HikariDataSource; -import lombok.SneakyThrows; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.db.DbDomainQueries; -import nu.marginalia.search.model.DomainInformation; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import java.sql.SQLException; -import java.util.*; - -/* - TODO: This class needs to be refactored, a lot of - these SQL queries are redundant and can be - collapsed into one single query that fetches - all the information - */ -@Singleton -public class DomainInformationService { - - private DbDomainQueries dbDomainQueries; - private HikariDataSource dataSource; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - @Inject - public DomainInformationService( - DbDomainQueries dbDomainQueries, - HikariDataSource dataSource) { - this.dbDomainQueries = dbDomainQueries; - this.dataSource = dataSource; - } - - - public Optional domainInfo(String site) { - - OptionalInt maybeDomainId = getDomainFromPartial(site); - if (maybeDomainId.isEmpty()) { - return Optional.empty(); - } - int domainId = maybeDomainId.getAsInt(); - - Optional domain = dbDomainQueries.getDomain(domainId); - if (domain.isEmpty()) { - return Optional.empty(); - } - - boolean blacklisted = isBlacklisted(domain.get()); - int pagesKnown = getPagesKnown(domainId); - int pagesVisited = getPagesVisited(domainId); - int pagesIndexed = getPagesIndexed(domainId); - int incomingLinks = getIncomingLinks(domainId); - int outboundLinks = getOutboundLinks(domainId); - int nodeAffinity = getNodeAffinity(domainId); - boolean inCrawlQueue = inCrawlQueue(domainId); - - double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100; - - DomainIndexingState state = getDomainState(domainId); - List linkingDomains = getLinkingDomains(domainId); - - var di = DomainInformation.builder() - .domain(domain.get()) - .blacklisted(blacklisted) - .pagesKnown(pagesKnown) - .pagesFetched(pagesVisited) - .pagesIndexed(pagesIndexed) - .incomingLinks(incomingLinks) - .outboundLinks(outboundLinks) - .ranking(rank) - .state(state.desc) - .linkingDomains(linkingDomains) - .inCrawlQueue(inCrawlQueue) - .nodeAffinity(nodeAffinity) - .suggestForCrawling((pagesVisited == 0 && outboundLinks == 0 && !inCrawlQueue)) - .build(); - - return Optional.of(di); - } - - private int getNodeAffinity(int domainId) { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement(""" - SELECT NODE_AFFINITY FROM EC_DOMAIN WHERE ID=? - """)) { - stmt.setInt(1, domainId); - var rs = stmt.executeQuery(); - if (rs.next()) - return rs.getInt(1); - } - } - catch (SQLException ex) { - logger.error("SQL error", ex); - } - return -1; - } - - @SneakyThrows - private boolean inCrawlQueue(int domainId) { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement( - """ - SELECT 1 FROM CRAWL_QUEUE - INNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME - WHERE EC_DOMAIN.ID=? - """)) - { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - return rsp.next(); - } - } - } - - private OptionalInt getDomainFromPartial(String site) { - return dbDomainQueries.tryGetDomainId(new EdgeDomain(site)); - } - - @SneakyThrows - public boolean isBlacklisted(EdgeDomain domain) { - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN IN (?,?)")) { - stmt.setString(1, domain.domain); - stmt.setString(2, domain.toString()); - var rsp = stmt.executeQuery(); - return rsp.next(); - } - } - } - - @SneakyThrows - public int getPagesKnown(int domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - public int getPagesVisited(int domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - - @SneakyThrows - public int getPagesIndexed(int domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - public int getIncomingLinks(int domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - @SneakyThrows - public int getOutboundLinks(int domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - public DomainIndexingState getDomainState(int domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return DomainIndexingState.valueOf(rsp.getString(1)); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return DomainIndexingState.ERROR; - } - - public List getLinkingDomains(int domainId) { - try (var connection = dataSource.getConnection()) { - List results = new ArrayList<>(25); - try (var stmt = connection.prepareStatement("SELECT SOURCE_DOMAIN FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - results.add(new EdgeDomain(rsp.getString(1))); - } - return results; - } catch (Exception ex) { - logger.error("DB error", ex); - } - - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return Collections.emptyList(); - } - - public double getRank(int domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getDouble(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return 1; - } -} diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java index c5f918d0..8cb9a84f 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java @@ -1,16 +1,16 @@ package nu.marginalia.search.svc; import com.google.inject.Inject; +import nu.marginalia.assistant.client.AssistantClient; +import nu.marginalia.assistant.client.model.SimilarDomain; import nu.marginalia.client.Context; import nu.marginalia.db.DbDomainQueries; -import nu.marginalia.db.DomainBlacklist; import nu.marginalia.model.EdgeDomain; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; import nu.marginalia.search.SearchOperator; -import nu.marginalia.search.model.DomainInformation; +import nu.marginalia.assistant.client.model.DomainInformation; import nu.marginalia.search.model.UrlDetails; -import nu.marginalia.search.siteinfo.DomainInformationService; import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData; import spark.Request; import spark.Response; @@ -23,22 +23,19 @@ import java.util.Map; public class SearchSiteInfoService { private final SearchOperator searchOperator; - private final SimilarDomainsService similarDomains; - private final DomainInformationService domainInformationService; + private final AssistantClient assistantClient; private final SearchFlagSiteService flagSiteService; private final DbDomainQueries domainQueries; private final MustacheRenderer renderer; @Inject public SearchSiteInfoService(SearchOperator searchOperator, - SimilarDomainsService similarDomains, - DomainInformationService domainInformationService, + AssistantClient assistantClient, RendererFactory rendererFactory, SearchFlagSiteService flagSiteService, DbDomainQueries domainQueries) throws IOException { this.searchOperator = searchOperator; - this.similarDomains = similarDomains; - this.domainInformationService = domainInformationService; + this.assistantClient = assistantClient; this.flagSiteService = flagSiteService; this.domainQueries = domainQueries; @@ -108,13 +105,6 @@ public class SearchSiteInfoService { false); } - private DomainInformation dummyInformation(String domainName) { - return DomainInformation.builder() - .domain(new EdgeDomain(domainName)) - .suggestForCrawling(true) - .unknownDomain(true) - .build(); - } private Backlinks listLinks(Context ctx, String domainName) { return new Backlinks(domainName, @@ -126,13 +116,24 @@ public class SearchSiteInfoService { final int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1); - final DomainInformation domainInfo = domainInformationService.domainInfo(domainName) - .orElseGet(() -> dummyInformation(domainName)); + final DomainInformation domainInfo; + final List similarSet; + final List linkingDomains; - final List similarSet = - similarDomains.getSimilarDomains(domainId, 100); - final List linkingDomains = - similarDomains.getLinkingDomains(domainId, 100); + if (domainId < 0 || !assistantClient.isAccepting()) { + domainInfo = createDummySiteInfo(domainName); + similarSet = List.of(); + linkingDomains = List.of(); + } + else { + domainInfo = assistantClient.domainInformation(ctx, domainId).blockingFirst(); + similarSet = assistantClient + .similarDomains(ctx, domainId, 100) + .blockingFirst(); + linkingDomains = assistantClient + .linkedDomains(ctx, domainId, 100) + .blockingFirst(); + } return new SiteInfoWithContext(domainName, domainId, @@ -141,6 +142,15 @@ public class SearchSiteInfoService { linkingDomains ); } + + private DomainInformation createDummySiteInfo(String domainName) { + return DomainInformation.builder() + .domain(new EdgeDomain(domainName)) + .suggestForCrawling(true) + .unknownDomain(true) + .build(); + } + private Docs listDocs(Context ctx, String domainName) { return new Docs(domainName, domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1), @@ -181,13 +191,13 @@ public class SearchSiteInfoService { String domain, long domainId, DomainInformation domainInformation, - List similar, - List linking) { + List similar, + List linking) { public SiteInfoWithContext(String domain, long domainId, DomainInformation domainInformation, - List similar, - List linking + List similar, + List linking ) { this(Map.of("info", true), diff --git a/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb b/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb index f911d3db..88b6ad84 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb @@ -99,7 +99,6 @@ + diff --git a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb index 5b6e40dd..979226a1 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb @@ -6,5 +6,6 @@ Pages Known: {{pagesKnown}}
Pages Crawled: {{pagesFetched}}
Pages Indexed: {{pagesIndexed}}
+ IP: {{ip}} {{#if ipCountry}}{{getIpFlag}}{{/if}}

\ No newline at end of file diff --git a/code/services-core/assistant-service/build.gradle b/code/services-core/assistant-service/build.gradle index 89f90eee..8609903d 100644 --- a/code/services-core/assistant-service/build.gradle +++ b/code/services-core/assistant-service/build.gradle @@ -27,11 +27,13 @@ dependencies { implementation project(':code:common:config') implementation project(':code:common:service') implementation project(':code:common:model') + implementation project(':code:common:db') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') implementation project(':code:features-search:screenshots') + implementation project(':code:libraries:geo-ip') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java index 3992986b..592e6308 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java @@ -3,6 +3,8 @@ package nu.marginalia.assistant; import com.google.gson.Gson; import com.google.inject.Inject; import lombok.SneakyThrows; +import nu.marginalia.assistant.domains.DomainInformationService; +import nu.marginalia.assistant.domains.SimilarDomainsService; import nu.marginalia.assistant.eval.Units; import nu.marginalia.assistant.suggest.Suggestions; import nu.marginalia.assistant.eval.MathParser; @@ -16,11 +18,16 @@ import spark.Request; import spark.Response; import spark.Spark; +import java.util.ArrayList; +import java.util.Objects; + public class AssistantService extends Service { private final Logger logger = LoggerFactory.getLogger(getClass()); private final Gson gson = GsonFactory.get(); private final Units units; private final MathParser mathParser; + private final SimilarDomainsService similarDomainsService; + private final DomainInformationService domainInformationService; private final Suggestions suggestions; @SneakyThrows @@ -30,12 +37,16 @@ public class AssistantService extends Service { MathParser mathParser, Units units, ScreenshotService screenshotService, + SimilarDomainsService similarDomainsService, + DomainInformationService domainInformationService, Suggestions suggestions) { super(params); this.mathParser = mathParser; this.units = units; + this.similarDomainsService = similarDomainsService; + this.domainInformationService = domainInformationService; this.suggestions = suggestions; Spark.staticFiles.expireTime(600); @@ -56,12 +67,50 @@ public class AssistantService extends Service { rsp, req.queryParams("value") )); - + Spark.get("/domain/:id/similar", this::getSimilarDomains, this::convertToJson); + Spark.get("/domain/:id/linking", this::getLinkingDomains, this::convertToJson); + Spark.get("/domain/:id/info", this::getDomainInformation, this::convertToJson); Spark.get("/public/suggest/", this::getSuggestions, this::convertToJson); Spark.awaitInitialization(); } + private Object getSimilarDomains(Request request, Response response) { + int domainId = Integer.parseInt(request.params("id")); + int count = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("count"), "25")); + + response.type("application/json"); + + if (!similarDomainsService.isReady()) { + return new ArrayList<>(); + } + + return similarDomainsService.getSimilarDomains(domainId, count); + } + + private Object getLinkingDomains(Request request, Response response) { + int domainId = Integer.parseInt(request.params("id")); + int count = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("count"), "25")); + + response.type("application/json"); + if (!similarDomainsService.isReady()) { + return new ArrayList<>(); + } + return similarDomainsService.getLinkingDomains(domainId, count); + } + + private Object getDomainInformation(Request request, Response response) { + int domainId = Integer.parseInt(request.params("id")); + + response.type("application/json"); + + var maybeDomainInfo = domainInformationService.domainInfo(domainId); + if (maybeDomainInfo.isEmpty()) { + Spark.halt(404); + } + return maybeDomainInfo.get(); + } + private Object getSuggestions(Request request, Response response) { response.type("application/json"); var param = request.queryParams("partial"); diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java new file mode 100644 index 00000000..690509db --- /dev/null +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java @@ -0,0 +1,115 @@ +package nu.marginalia.assistant.domains; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.geoip.GeoIpDictionary; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.db.DbDomainQueries; +import nu.marginalia.assistant.client.model.DomainInformation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.inject.Inject; +import com.google.inject.Singleton; + +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.*; + +@Singleton +public class DomainInformationService { + private final GeoIpDictionary geoIpDictionary; + + private DbDomainQueries dbDomainQueries; + private HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @Inject + public DomainInformationService( + DbDomainQueries dbDomainQueries, + GeoIpDictionary geoIpDictionary, + HikariDataSource dataSource) { + this.dbDomainQueries = dbDomainQueries; + this.geoIpDictionary = geoIpDictionary; + this.dataSource = dataSource; + } + + + public Optional domainInfo(int domainId) { + + Optional domain = dbDomainQueries.getDomain(domainId); + if (domain.isEmpty()) { + return Optional.empty(); + } + + + var builder = DomainInformation.builder(); + try (var connection = dataSource.getConnection(); + var stmt = connection.createStatement(); + ) { + boolean inCrawlQueue; + int outboundLinks = 0; + int pagesVisited = 0; + + ResultSet rs; + + rs = stmt.executeQuery(STR.""" + SELECT IP, NODE_AFFINITY, DOMAIN_NAME, STATE, IFNULL(RANK, 1) AS RANK + FROM EC_DOMAIN WHERE ID=\{domainId} + """); + if (rs.next()) { + String ip = rs.getString("IP"); + + builder.ip(ip); + builder.ipCountry(geoIpDictionary.getCountry(ip)); + + builder.nodeAffinity(rs.getInt("NODE_AFFINITY")); + builder.domain(new EdgeDomain(rs.getString("DOMAIN_NAME"))); + builder.state(rs.getString("STATE")); + builder.ranking(Math.round(100.0*(1.0-rs.getDouble("RANK")))); + } + rs = stmt.executeQuery(STR.""" + SELECT 1 FROM CRAWL_QUEUE + INNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME + WHERE EC_DOMAIN.ID=\{domainId} + """); + inCrawlQueue = rs.next(); + builder.inCrawlQueue(inCrawlQueue); + + rs = stmt.executeQuery(STR.""" + SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=\{domainId} + """); + if (rs.next()) { + builder.incomingLinks(rs.getInt(1)); + } + + rs = stmt.executeQuery(STR.""" + SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=\{domainId} + """); + if (rs.next()) { + builder.outboundLinks(rs.getInt(1)); + outboundLinks = rs.getInt(1); + } + + + rs = stmt.executeQuery(STR.""" + SELECT KNOWN_URLS, GOOD_URLS, VISITED_URLS FROM DOMAIN_METADATA WHERE ID=\{domainId} + """); + if (rs.next()) { + pagesVisited = rs.getInt("VISITED_URLS"); + + builder.pagesKnown(rs.getInt("KNOWN_URLS")); + builder.pagesIndexed(rs.getInt("GOOD_URLS")); + builder.pagesFetched(rs.getInt("VISITED_URLS")); + } + + builder.suggestForCrawling((pagesVisited == 0 && outboundLinks == 0 && !inCrawlQueue)); + + return Optional.of(builder.build()); + } + catch (SQLException ex) { + logger.error("SQL error", ex); + return Optional.empty(); + } + } + +} diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SimilarDomainsService.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/SimilarDomainsService.java similarity index 84% rename from code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SimilarDomainsService.java rename to code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/SimilarDomainsService.java index 51699dcc..9156408c 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SimilarDomainsService.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/SimilarDomainsService.java @@ -1,4 +1,4 @@ -package nu.marginalia.search.svc; +package nu.marginalia.assistant.domains; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; @@ -6,11 +6,10 @@ import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntDoubleHashMap; import gnu.trove.map.hash.TIntIntHashMap; -import gnu.trove.map.hash.TLongDoubleHashMap; import gnu.trove.set.TIntSet; import gnu.trove.set.hash.TIntHashSet; +import nu.marginalia.assistant.client.model.SimilarDomain; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,6 +39,8 @@ public class SimilarDomainsService { public volatile double[] domainRanks = null; public volatile String[] domainNames = null; + volatile boolean isReady = false; + @Inject public SimilarDomainsService(HikariDataSource dataSource) { this.dataSource = dataSource; @@ -167,6 +168,7 @@ public class SimilarDomainsService { logger.info("Loaded {} domains", domainRanks.length); logger.info("All done!"); + isReady = true; } } catch (SQLException throwables) { @@ -174,7 +176,11 @@ public class SimilarDomainsService { } } - double getRelatedness(int a, int b) { + public boolean isReady() { + return isReady; + } + + private double getRelatedness(int a, int b) { int lowerIndex = Math.min(domainIdToIdx.get(a), domainIdToIdx.get(b)); int higherIndex = Math.max(domainIdToIdx.get(a), domainIdToIdx.get(b)); @@ -233,14 +239,14 @@ public class SimilarDomainsService { indexedDomains.get(idx), activeDomains.get(idx), screenshotDomains.get(idx), - LinkType.find( + SimilarDomain.LinkType.find( linkingIdsStoD.contains(idx), linkingIdsDtoS.contains(idx) ) )); } - domains.removeIf(d -> d.url.domain.toString().length() > 32); + domains.removeIf(d -> d.url().domain.toString().length() > 32); return domains; } @@ -319,84 +325,16 @@ public class SimilarDomainsService { indexedDomains.get(idx), activeDomains.get(idx), screenshotDomains.get(idx), - LinkType.find( + SimilarDomain.LinkType.find( linkingIdsStoD.contains(idx), linkingIdsDtoS.contains(idx) ) )); } - domains.removeIf(d -> d.url.domain.toString().length() > 32); + domains.removeIf(d -> d.url().domain.toString().length() > 32); return domains; } - public record SimilarDomain(EdgeUrl url, - int domainId, - double relatedness, - double rank, - boolean indexed, - boolean active, - boolean screenshot, - LinkType linkType) - { - - public String getRankSymbols() { - if (rank > 90) { - return "★★★★★"; - } - if (rank > 70) { - return "★★★★"; - } - if (rank > 50) { - return "★★★"; - } - if (rank > 30) { - return "★★"; - } - if (rank > 10) { - return "★"; - } - return ""; - } - } - - enum LinkType { - BACKWARD, - FOWARD, - BIDIRECTIONAL, - NONE; - - public static LinkType find(boolean linkStod, - boolean linkDtos) - { - if (linkDtos && linkStod) - return BIDIRECTIONAL; - if (linkDtos) - return FOWARD; - if (linkStod) - return BACKWARD; - - return NONE; - } - - public String toString() { - return switch (this) { - case FOWARD -> "→"; - case BACKWARD -> "←"; - case BIDIRECTIONAL -> "⇆"; - case NONE -> "-"; - }; - } - - public String getDescription() { - return switch (this) { - case BACKWARD -> "Backward Link"; - case FOWARD -> "Forward Link"; - case BIDIRECTIONAL -> "Mutual Link"; - case NONE -> "No Link"; - }; - } - }; - } \ No newline at end of file diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java index a75ab75a..7adf7921 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java @@ -20,8 +20,9 @@ import java.util.stream.Collectors; import java.util.stream.Stream; public class Suggestions { - private final PatriciaTrie suggestionsTrie; - private final TermFrequencyDict termFrequencyDict; + private PatriciaTrie suggestionsTrie = null; + private TermFrequencyDict termFrequencyDict = null; + private volatile boolean ready = false; private final SpellChecker spellChecker; private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$"); @@ -35,10 +36,12 @@ public class Suggestions { ) { this.spellChecker = spellChecker; - suggestionsTrie = loadSuggestions(suggestionsFile); - termFrequencyDict = dict; - - logger.info("Loaded {} suggestions", suggestionsTrie.size()); + Thread.ofPlatform().start(() -> { + suggestionsTrie = loadSuggestions(suggestionsFile); + termFrequencyDict = dict; + ready = true; + logger.info("Loaded {} suggestions", suggestionsTrie.size()); + }); } private static PatriciaTrie loadSuggestions(Path file) { @@ -71,6 +74,9 @@ public class Suggestions { } public List getSuggestions(int count, String searchWord) { + if (!ready) + return Collections.emptyList(); + if (searchWord.length() < MIN_SUGGEST_LENGTH) { return Collections.emptyList(); } @@ -126,6 +132,9 @@ public class Suggestions { public Stream getSuggestionsForKeyword(int count, String prefix) { + if (!ready) + return Stream.empty(); + if (prefix.length() < MIN_SUGGEST_LENGTH) { return Stream.empty(); } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/RecrawlAllActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/RecrawlAllActor.java index a6733b63..e0268901 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/RecrawlAllActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/RecrawlAllActor.java @@ -25,7 +25,7 @@ public class RecrawlAllActor extends RecordActorPrototype { public record Initial() implements ActorStep {} - public record WaitFinished(int node) implements ActorStep {} + public record WaitFinished(int node, long msgId) implements ActorStep {} @Resume(behavior=ActorResumeBehavior.RETRY) public record Trigger(int node) implements ActorStep {} public record AdvanceNode(int node) implements ActorStep {} @@ -49,17 +49,18 @@ public class RecrawlAllActor extends RecordActorPrototype { var data = new ExecutorRemoteActorFactory.CrawlData(activeFileStorage.get(0), true); - if (remoteActorFactory.createCrawlRemote(node).trigger(data)) { - yield new WaitFinished(node); + long msgId = remoteActorFactory.createCrawlRemote(node).trigger(data); + if (msgId >= 0) { + yield new WaitFinished(node, msgId); } else { yield new AdvanceNode(node); } } - case WaitFinished(int node) -> { + case WaitFinished(int node, long msgId) -> { var remoteActor = remoteActorFactory.createCrawlRemote(node); for (;;) { - var state = remoteActor.getState(); + var state = remoteActor.getState(msgId); if ("END".equals(state) || "ERROR".equals(state)) { break; } @@ -80,8 +81,7 @@ public class RecrawlAllActor extends RecordActorPrototype { public RecrawlAllActor(Gson gson, ExecutorRemoteActorFactory remoteActorFactory, FileStorageService fileStorageService, - PrecessionNodes precessionNodes, - NodeConfigurationService nodeConfigurationService) + PrecessionNodes precessionNodes) { super(gson); this.remoteActorFactory = remoteActorFactory; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/ReprocessAllActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/ReprocessAllActor.java index 416f4d90..d37ecc2a 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/ReprocessAllActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/ReprocessAllActor.java @@ -24,7 +24,7 @@ public class ReprocessAllActor extends RecordActorPrototype { public record Initial() implements ActorStep {} - public record WaitFinished(int node) implements ActorStep {} + public record WaitFinished(int node, long msgId) implements ActorStep {} @Resume(behavior=ActorResumeBehavior.RETRY) public record Trigger(int node) implements ActorStep {} public record AdvanceNode(int node) implements ActorStep {} @@ -47,17 +47,18 @@ public class ReprocessAllActor extends RecordActorPrototype { var data = new ExecutorRemoteActorFactory.ConvertAndLoadData(activeFileStorage.get(0)); - if (remoteActorFactory.createConvertAndLoadRemote(node).trigger(data)) { - yield new WaitFinished(node); + long msgId = remoteActorFactory.createConvertAndLoadRemote(node).trigger(data); + if (msgId >= 0) { + yield new WaitFinished(node, msgId); } else { yield new AdvanceNode(node); } } - case WaitFinished(int node) -> { + case WaitFinished(int node, long msgId) -> { var remoteActor = remoteActorFactory.createConvertAndLoadRemote(node); for (;;) { - var state = remoteActor.getState(); + var state = remoteActor.getState(msgId); if ("END".equals(state) || "ERROR".equals(state)) break; TimeUnit.SECONDS.sleep(10); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java index 4c06bf2f..47dcf5b2 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java @@ -37,7 +37,7 @@ public class IndexSearchSetsService { // Below are binary indices that are used to constrain a search - private volatile RankingSearchSet retroSet; + private volatile RankingSearchSet popularSet; private volatile RankingSearchSet smallWebSet; private volatile RankingSearchSet academiaSet; private volatile RankingSearchSet blogsSet; @@ -72,7 +72,7 @@ public class IndexSearchSetsService { smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat")); academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat")); - retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat")); + popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, servicesFactory.getSearchSetsBase().resolve("popular.dat")); blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, servicesFactory.getSearchSetsBase().resolve("blogs.dat")); } @@ -86,7 +86,7 @@ public class IndexSearchSetsService { } return switch (searchSetIdentifier) { case NONE -> anySet; - case RETRO -> retroSet; + case POPULAR -> popularSet; case ACADEMIA -> academiaSet; case SMALLWEB -> smallWebSet; case BLOGS -> blogsSet; @@ -95,7 +95,7 @@ public class IndexSearchSetsService { enum RepartitionSteps { UPDATE_ACADEMIA, - UPDATE_RETRO, + UPDATE_POPULAR, UPDATE_SMALL_WEB, UPDATE_BLOGS, UPDATE_RANKINGS, @@ -107,8 +107,8 @@ public class IndexSearchSetsService { processHeartbeat.progress(RepartitionSteps.UPDATE_ACADEMIA); updateAcademiaDomainsSet(); - processHeartbeat.progress(RepartitionSteps.UPDATE_RETRO); - updateRetroDomainsSet(); + processHeartbeat.progress(RepartitionSteps.UPDATE_POPULAR); + updatePopularDomainsSet(); processHeartbeat.progress(RepartitionSteps.UPDATE_SMALL_WEB); updateSmallWebDomainsSet(); @@ -139,15 +139,15 @@ public class IndexSearchSetsService { } @SneakyThrows - public void updateRetroDomainsSet() { + public void updatePopularDomainsSet() { var entry = rankingSettings.retro; var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new)); var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new); synchronized (this) { - retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data); - retroSet.write(); + popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, popularSet.source, data); + popularSet.write(); } } diff --git a/settings.gradle b/settings.gradle index 59f42bec..af8a45f5 100644 --- a/settings.gradle +++ b/settings.gradle @@ -12,6 +12,7 @@ include 'code:services-application:dating-service' include 'code:services-application:explorer-service' include 'code:libraries:array' +include 'code:libraries:geo-ip' include 'code:libraries:btree' include 'code:libraries:easy-lsh' include 'code:libraries:guarded-regex'