Merge branch 'master' into warc

Viktor Lofgren 2023-12-11 14:30:20 +01:00
commit 45987a1d98
57 changed files with 989 additions and 804 deletions

View File

@ -1,12 +1,14 @@
package nu.marginalia.assistant.client;
import com.google.gson.reflect.TypeToken;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.assistant.client.model.DictionaryResponse;
import nu.marginalia.assistant.client.model.DomainInformation;
import nu.marginalia.assistant.client.model.SimilarDomain;
import nu.marginalia.client.AbstractDynamicClient;
import nu.marginalia.client.exception.RouteNotConfiguredException;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.service.descriptor.ServiceDescriptors;
import nu.marginalia.service.id.ServiceId;
@ -14,6 +16,7 @@ import nu.marginalia.client.Context;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
@Singleton
@ -59,4 +62,31 @@ public class AssistantClient extends AbstractDynamicClient {
return Observable.empty();
}
}
public Observable<ArrayList<SimilarDomain>> similarDomains(Context ctx, int domainId, int count) {
try {
return super.get(ctx, 0, STR."/domain/\{domainId}/similar?count=\{count}", new TypeToken<ArrayList<SimilarDomain>>() {});
}
catch (RouteNotConfiguredException ex) {
return Observable.empty();
}
}
public Observable<ArrayList<SimilarDomain>> linkedDomains(Context ctx, int domainId, int count) {
try {
return super.get(ctx, 0, STR."/domain/\{domainId}/linking?count=\{count}", new TypeToken<ArrayList<SimilarDomain>>() {});
}
catch (RouteNotConfiguredException ex) {
return Observable.empty();
}
}
public Observable<DomainInformation> domainInformation(Context ctx, int domainId) {
try {
return super.get(ctx, 0, STR."/domain/\{domainId}/info", DomainInformation.class);
}
catch (RouteNotConfiguredException ex) {
return Observable.empty();
}
}
}

View File

@ -0,0 +1,42 @@
package nu.marginalia.assistant.client.model;
import lombok.*;
import nu.marginalia.model.EdgeDomain;
@Getter @AllArgsConstructor @NoArgsConstructor @Builder
@ToString
public class DomainInformation {
EdgeDomain domain;
boolean blacklisted;
int pagesKnown;
int pagesFetched;
int pagesIndexed;
int incomingLinks;
int outboundLinks;
int nodeAffinity;
double ranking;
boolean suggestForCrawling;
boolean inCrawlQueue;
boolean unknownDomain;
String ip;
String ipCountry;
String state;
public String getIpFlag() {
if (ipCountry == null || ipCountry.isBlank()) {
return "";
}
String country = ipCountry;
if ("UK".equals(country)) {
return "GB";
}
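// Map the two ASCII letters of the country code (A = 0x41) into the Unicode
// regional indicator range (starting at 0x1F1E6), which renders as a flag emoji.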
int offset = 0x1F1E6;
int asciiOffset = 0x41;
int firstChar = Character.codePointAt(country, 0) - asciiOffset + offset;
int secondChar = Character.codePointAt(country, 1) - asciiOffset + offset;
return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar));
}
}

View File

@ -0,0 +1,69 @@
package nu.marginalia.assistant.client.model;
import nu.marginalia.model.EdgeUrl;
public record SimilarDomain(EdgeUrl url,
int domainId,
double relatedness,
double rank,
boolean indexed,
boolean active,
boolean screenshot,
LinkType linkType) {
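/** Renders the domain's rank as zero to five filled stars; &#9733; is the HTML entity for a star. */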
public String getRankSymbols() {
if (rank > 90) {
return "&#9733;&#9733;&#9733;&#9733;&#9733;";
}
if (rank > 70) {
return "&#9733;&#9733;&#9733;&#9733;";
}
if (rank > 50) {
return "&#9733;&#9733;&#9733;";
}
if (rank > 30) {
return "&#9733;&#9733;";
}
if (rank > 10) {
return "&#9733;";
}
return "";
}
public enum LinkType {
BACKWARD,
FOWARD,
BIDIRECTIONAL,
NONE;
public static LinkType find(boolean linkStod,
boolean linkDtos) {
if (linkDtos && linkStod)
return BIDIRECTIONAL;
if (linkDtos)
return FOWARD;
if (linkStod)
return BACKWARD;
return NONE;
}
public String toString() {
return switch (this) {
case FOWARD -> "&#8594;";
case BACKWARD -> "&#8592;";
case BIDIRECTIONAL -> "&#8646;";
case NONE -> "-";
};
}
public String getDescription() {
return switch (this) {
case BACKWARD -> "Backward Link";
case FOWARD -> "Forward Link";
case BIDIRECTIONAL -> "Mutual Link";
case NONE -> "No Link";
};
}
}
}

View File

@ -34,8 +34,23 @@ public class ExecutorRemoteActorFactory {
}
public interface ExecutorRemoteActorIf<T> {
boolean trigger(T object) throws Exception;
String getState();
/** Trigger the remote actor with the given object. The object will be serialized to JSON and sent to the
* remote actor. If the remote actor does not respond within a reasonable time, a negative
* message id will be returned.
*
* @param object The message to send to the remote actor
* @return The message id of the response message, or a negative number if the remote actor did not respond
* within a reasonable timeout.
*/
long trigger(T object) throws Exception;
/** Get the last finished state of the actor.
* <p>
* The message id of the request initiating the actor must be provided to ensure that
* we don't get a state from a previous run.
*/
String getState(long fromMsgId);
}
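A minimal usage sketch of the revised interface; the factory method name and surrounding variables are illustrative, not part of this change:
// hypothetical: obtain a remote actor for the crawl inbox from the factory
var remoteActor = remoteActorFactory.createCrawlRemote(node);
long msgId = remoteActor.trigger(new ExecutorRemoteActorFactory.CrawlData(storageId, true));
if (msgId < 0) {
// the message errored out, or the remote actor never acknowledged it within the timeout
return;
}
// later: read the last finished state, scoped to this run via the message id
String state = remoteActor.getState(msgId);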
public record CrawlData(FileStorageId storageId, boolean cascadeLoad) {}
@ -58,11 +73,11 @@ class ExecutorRemoteActor<T> implements ExecutorRemoteActorFactory.ExecutorRemot
this.triggerFunction = triggerFunction;
}
public boolean trigger(T object) throws Exception {
public long trigger(T object) throws Exception {
return trigger(gson.toJson(object));
}
public boolean trigger(String payload) throws Exception {
public long trigger(String payload) throws Exception {
long id = persistence.sendNewMessage(inboxName, null, null, triggerFunction, payload, null);
// Wait for the remote actor to respond to the message
@ -70,19 +85,19 @@ class ExecutorRemoteActor<T> implements ExecutorRemoteActorFactory.ExecutorRemot
for (int i = 0; i < 120; i++) {
var msg = persistence.getMessage(id);
if (msg.state() == MqMessageState.ACK || msg.state() == MqMessageState.OK)
return true;
return id;
if (msg.state() == MqMessageState.ERR || msg.state() == MqMessageState.DEAD)
return false;
return -id;
TimeUnit.SECONDS.sleep(1);
}
return false; // Timeout
return -1; // Timeout
}
public String getState() {
public String getState(long fromMsgId) {
return persistence
.getHeadMessage(inboxName)
.getHeadMessage(inboxName, fromMsgId)
.map(MqMessage::function)
.orElse("INITIAL");
}

View File

@ -7,7 +7,7 @@ package nu.marginalia.index.client.model.query;
* */
public enum SearchSetIdentifier {
NONE,
RETRO,
POPULAR,
BLOGS,
ACADEMIA,
SMALLWEB

View File

@ -1,6 +1,5 @@
package nu.marginalia.client;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import io.reactivex.rxjava3.core.Scheduler;
import io.reactivex.rxjava3.schedulers.Schedulers;
import org.slf4j.Logger;
@ -10,26 +9,16 @@ import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadFactory;
public class AbortingScheduler {
private final ThreadFactory threadFactory;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Nullable
private ExecutorService executorService;
public AbortingScheduler(String name) {
threadFactory = new ThreadFactoryBuilder()
.setNameFormat(name+"client--%d")
.setUncaughtExceptionHandler(this::handleException)
.build();
public AbortingScheduler() {
}
private void handleException(Thread thread, Throwable throwable) {
logger.error("Uncaught exception during Client IO in thread {}", thread.getName(), throwable);
}
public synchronized Scheduler get() {
return Schedulers.from(getExecutorService(),
@ -40,14 +29,14 @@ public class AbortingScheduler {
public synchronized void abort() {
if (null != executorService) {
executorService.shutdownNow();
executorService = Executors.newFixedThreadPool(16, threadFactory);
executorService = Executors.newVirtualThreadPerTaskExecutor();
}
}
@Nonnull
private synchronized ExecutorService getExecutorService() {
if (null == executorService) {
executorService = Executors.newFixedThreadPool(16, threadFactory);
executorService = Executors.newVirtualThreadPerTaskExecutor();
}
return executorService;
}

View File

@ -1,6 +1,7 @@
package nu.marginalia.client;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import com.google.protobuf.GeneratedMessageV3;
import io.reactivex.rxjava3.core.Observable;
import io.reactivex.rxjava3.core.ObservableSource;
@ -20,6 +21,7 @@ import org.slf4j.LoggerFactory;
import spark.utils.IOUtils;
import java.io.OutputStream;
import java.lang.reflect.Type;
import java.net.ConnectException;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
@ -233,6 +235,22 @@ public abstract class AbstractClient implements AutoCloseable {
.doFinally(() -> ThreadContext.remove("outbound-request"));
}
protected synchronized <T> Observable<T> get(Context ctx, int node, String endpoint, TypeToken<T> type) {
ensureAlive(node);
var req = ctx.paint(new Request.Builder()).url(serviceRoutes.get(node) + endpoint).get().build();
return Observable.just(client.newCall(req))
.subscribeOn(scheduler().get())
.map(this::logInbound)
.map(Call::execute)
.map(this::logOutbound)
.map(rsp -> validateResponseStatus(rsp, req, 200))
.map(rsp -> getEntity(rsp, type))
.retryWhen(this::retryHandler)
.timeout(timeout, TimeUnit.SECONDS)
.doFinally(() -> ThreadContext.remove("outbound-request"));
}
protected synchronized Observable<Integer> get(Context ctx, int node, String endpoint, OutputStream outputStream) {
ensureAlive(node);
@ -388,6 +406,15 @@ public abstract class AbstractClient implements AutoCloseable {
}
}
@SneakyThrows
private <T> T getEntity(Response response, TypeToken<T> clazz) {
try (response) {
return gson.fromJson(response.body().charStream(), clazz);
}
catch (Exception ex) {
throw ex;
}
}
@SneakyThrows
private String getText(Response response) {
try (response) {
return response.body().string();

View File

@ -1,8 +1,6 @@
package nu.marginalia.client;
import com.google.gson.Gson;
import nu.marginalia.client.route.RouteProvider;
import nu.marginalia.client.route.ServiceRoute;
import nu.marginalia.service.descriptor.ServiceDescriptor;
import javax.annotation.Nonnull;
@ -20,7 +18,7 @@ public class AbstractDynamicClient extends AbstractClient {
);
this.service = service;
this.scheduler = new AbortingScheduler(name());
this.scheduler = new AbortingScheduler();
}
@Override

View File

@ -42,7 +42,7 @@ public class AbstractClientTest {
client = new AbstractClient(new RouteProvider(new ServiceDescriptor(ServiceId.Api, "localhost")), 1, Gson::new) {
@Override
public AbortingScheduler scheduler() {
return new AbortingScheduler(name());
return new AbortingScheduler();
}
@Override

View File

@ -15,6 +15,7 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:libraries:guarded-regex')
implementation project(':code:libraries:geo-ip')
implementation libs.notnull

View File

@ -1,72 +1,31 @@
package nu.marginalia.ip_blocklist;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.opencsv.CSVReader;
import com.opencsv.exceptions.CsvValidationException;
import lombok.AllArgsConstructor;
import nu.marginalia.WmsaHome;
import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.model.EdgeDomain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileReader;
import java.io.IOException;
import java.net.InetAddress;
import java.util.Set;
import java.util.TreeMap;
@Singleton
public class GeoIpBlocklist {
private final TreeMap<Long, GeoIpBlocklist.IpRange> ranges = new TreeMap<>();
/** These countries are extremely overrepresented among the problematic and spammy domains,
* and blocking them is by far the most effective spam mitigation technique. Sucks we throw
* babies out with the bathwater, but it's undeniably effective.
*/
private final Set<String> blacklist = Set.of("CN", "HK");
private final Set<String> graylist = Set.of("RU", "TW", "IN", "ZA", "SG", "UA");
private static final Logger logger = LoggerFactory.getLogger(GeoIpBlocklist.class);
@AllArgsConstructor
static class IpRange {
public final long from;
public final long to;
public final String country;
}
private final GeoIpDictionary ipDictionary;
public GeoIpBlocklist() throws IOException, CsvValidationException {
var resource = WmsaHome.getIPLocationDatabse();
try (var reader = new CSVReader(new FileReader(resource.toFile()))) {
for (;;) {
String[] vals = reader.readNext();
if (vals == null) {
break;
}
if (!(blacklist.contains(vals[2]) || graylist.contains(vals[2]))) {
continue;
}
var range = new GeoIpBlocklist.IpRange(Long.parseLong(vals[0]),
Long.parseLong(vals[1]),
vals[2]);
ranges.put(range.from, range);
}
}
logger.info("Loaded {} IP ranges", ranges.size());
}
public String getCountry(InetAddress address) {
byte[] bytes = address.getAddress();
long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF);
Long key = ranges.floorKey(ival);
if (null == key) {
return "-";
}
var range = ranges.get(key);
if (ival >= key && ival < range.to) {
return range.country;
}
return "-";
@Inject
public GeoIpBlocklist(GeoIpDictionary ipDictionary) {
this.ipDictionary = ipDictionary;
ipDictionary.waitReady();
}
public boolean isAllowed(EdgeDomain domain) {
@ -84,7 +43,7 @@ public class GeoIpBlocklist {
public String getCountry(EdgeDomain domain) {
try {
return getCountry(InetAddressCache.getAddress(domain));
return ipDictionary.getCountry(InetAddressCache.getAddress(domain));
}
catch (Throwable ex) {
logger.debug("Failed to resolve {}", domain);

View File

@ -11,7 +11,7 @@ import java.util.concurrent.TimeUnit;
// We don't want to torture the DNS by resolving the same links over and over and over again
public class InetAddressCache {
private static final Cache<EdgeDomain, InetAddress> cache = CacheBuilder.newBuilder().maximumSize(10_000_000).expireAfterAccess(1, TimeUnit.HOURS).build();
private static final Cache<EdgeDomain, InetAddress> cache = CacheBuilder.newBuilder().maximumSize(1_000_000).expireAfterAccess(1, TimeUnit.HOURS).build();
public static InetAddress getAddress(EdgeDomain domain) throws Throwable {
try {
return cache.get(domain, ()-> InetAddress.getByName(domain.getAddress()));

View File

@ -0,0 +1,139 @@
package nu.marginalia.util;
import lombok.SneakyThrows;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Consumer;
/**
* Abstraction for exposing a (typically) read-from-disk -> parallel processing -> sequential output
* workflow as an iterator, where the number of tasks is much larger than the number of cores
*/
public class ProcessingIterator<T> implements Iterator<T> {
private static final Logger logger = LoggerFactory.getLogger(ProcessingIterator.class);
private final LinkedBlockingQueue<T> queue;
private final AtomicBoolean isFinished = new AtomicBoolean(false);
private final ExecutorService executorService;
private final Semaphore sem;
private T next = null;
private final int parallelism;
public ProcessingIterator(int queueSize, int parallelism, ProcessingJob<T> task) {
this.parallelism = parallelism;
queue = new LinkedBlockingQueue<>(queueSize);
executorService = Executors.newFixedThreadPool(parallelism);
sem = new Semaphore(parallelism);
executorService.submit(() -> executeJob(task));
}
private void executeJob(ProcessingJob<T> job) {
try {
job.run(this::executeTask);
} catch (Exception e) {
logger.warn("Exception while processing", e);
} finally {
isFinished.set(true);
}
}
private void executeTask(Task<T> task) {
try {
sem.acquire();
} catch (InterruptedException e) {
return;
}
try {
queue.put(task.get());
} catch (Exception e) {
logger.warn("Exception while processing", e);
} finally {
sem.release();
}
}
/** Returns true if there are more documents to be processed.
* This method may block until we are certain this is true.
* <p>
* This method must be invoked from the same thread that invokes next(),
* (or synchronize between the two)
*/
@Override
@SneakyThrows
public boolean hasNext() {
if (next != null)
return true;
do {
next = queue.poll(1, TimeUnit.SECONDS);
if (next != null) {
return true;
}
} while (expectMore());
if (!executorService.isShutdown()) {
executorService.shutdown();
}
return false;
}
/** Heuristic for if we should expect more documents to be processed,
* _trust but verify_ since we don't run this in an exclusive section
* and may get a false positive. We never expect a false negative though.
*/
private boolean expectMore() {
return !isFinished.get() // we are still reading from the database
|| !queue.isEmpty() // ... or we have documents in the queue
|| sem.availablePermits() < parallelism; // ... or we are still processing documents
}
/** Returns the next document to be processed.
* This method may block until we are certain there is a document to be processed.
* <p>
* This method must be invoked from the same thread that invokes hasNext(),
* (or synchronize between the two)
* <p>
* If this is run after hasNext() returns false, a NoSuchElementException is thrown.
*/
@SneakyThrows
@Override
public T next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
try {
return next;
}
finally {
next = null;
}
}
/**
* A job that produces a sequence of processing tasks that are to be
* performed in parallel
*/
public interface ProcessingJob<T2> {
void run(Consumer<Task<T2>> output) throws Exception;
}
/**
* A single task that produces a result to be iterable via the Iterator interface
* (along with other tasks' outputs)
*/
public interface Task<T> {
T get() throws Exception;
}
}
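A minimal usage sketch of the iterator (the reader and task methods are illustrative); ProcessingIteratorTest further down exercises the same pattern:
Iterator<String> it = new ProcessingIterator<>(32, 8, tasks -> {
for (Path file : listFilesToProcess()) { // hypothetical source of work items
tasks.accept(() -> parseExpensively(file)); // executed on the worker pool
}
});
while (it.hasNext()) {
consume(it.next()); // results are consumed sequentially on the caller's thread
}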

View File

@ -0,0 +1,24 @@
plugins {
id 'java'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(21))
}
}
dependencies {
implementation project(':code:common:config')
implementation libs.bundles.slf4j
implementation libs.opencsv
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
useJUnitPlatform()
}

View File

@ -0,0 +1,6 @@
This micro library handles GeoIP lookups, mapping IP addresses
to country codes.
It uses the free ip2location lite database, which is
available from [https://lite.ip2location.com/database/ip-country](https://lite.ip2location.com/database/ip-country)
under a CC-BY-SA 4.0 license.
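A minimal usage sketch, assuming the GeoIpDictionary class shown below; the lookup address is illustrative:
GeoIpDictionary geoIp = new GeoIpDictionary(); // starts loading the CSV on a background thread
geoIp.waitReady(); // optional: block until the ranges have been loaded
String cc = geoIp.getCountry("93.184.216.34"); // two-letter country code, or "" if unknown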

View File

@ -0,0 +1,90 @@
package nu.marginalia.geoip;
import com.opencsv.CSVReader;
import nu.marginalia.WmsaHome;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileReader;
import java.net.InetAddress;
import java.util.TreeMap;
public class GeoIpDictionary {
private volatile TreeMap<Long, IpRange> ranges = null;
private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class);
record IpRange(long from, long to, String country) {}
public GeoIpDictionary() {
Thread.ofPlatform().start(() -> {
try (var reader = new CSVReader(new FileReader(WmsaHome.getIPLocationDatabse().toFile()))) {
var dict = new TreeMap<Long, IpRange>();
for (;;) {
String[] vals = reader.readNext();
if (vals == null) {
break;
}
var range = new IpRange(Long.parseLong(vals[0]),
Long.parseLong(vals[1]),
vals[2]);
dict.put(range.from, range);
}
ranges = dict;
logger.info("Loaded {} IP ranges", ranges.size());
} catch (Exception e) {
ranges = new TreeMap<>();
throw new RuntimeException(e);
}
finally {
synchronized (this) {
this.notifyAll();
}
}
});
}
public boolean isReady() {
return null != ranges;
}
public boolean waitReady() {
while (null == ranges) {
try {
synchronized (this) {
this.wait(1000);
}
} catch (InterruptedException e) {
return false;
}
}
return true;
}
public String getCountry(String ip) {
try {
return getCountry(InetAddress.getByName(ip));
} catch (Exception e) {
return "";
}
}
public String getCountry(InetAddress address) {
if (null == ranges) { // not loaded yet or failed to load
return "";
}
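// Pack the IPv4 octets into an unsigned 32-bit value, then look up the nearest range starting at or below it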
byte[] bytes = address.getAddress();
long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF);
Long key = ranges.floorKey(ival);
if (null == key) {
return "";
}
var range = ranges.get(key);
if (ival >= key && ival < range.to) {
return range.country;
}
return "";
}
}

View File

@ -496,17 +496,21 @@ public class MqPersistence {
return gson;
}
/** Returns the last message sent to this inbox with a state of 'OK' */
public Optional<MqMessage> getHeadMessage(String inboxName) {
/** Returns the last message sent to this inbox with a state of 'OK'
* with an id greater than or equal to fromMsgId
*/
public Optional<MqMessage> getHeadMessage(String inboxName, long fromMsgId) {
try (var conn = dataSource.getConnection();
var query = conn.prepareStatement("""
SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE, SENDER_INBOX
FROM MESSAGE_QUEUE
WHERE RECIPIENT_INBOX = ? AND STATE='OK'
WHERE RECIPIENT_INBOX = ? AND STATE='OK' AND ID >= ?
ORDER BY ID DESC LIMIT 1
"""))
{
query.setString(1, inboxName);
query.setLong(2, fromMsgId);
var rs = query.executeQuery();
if (rs.next()) {
long msgId = rs.getLong(1);

View File

@ -4,6 +4,7 @@ import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.ToString;
import nu.marginalia.bigstring.BigString;
import nu.marginalia.model.EdgeUrl;
@Builder
@AllArgsConstructor
@ -35,4 +36,16 @@ public class CrawledDocument implements SerializableCrawlData {
return SERIAL_IDENTIFIER;
}
@Override
public String getDomain() {
if (url == null)
return null;
return EdgeUrl
.parse(url)
.map(EdgeUrl::getDomain)
.map(d -> d.domain)
.orElse(null);
}
}

View File

@ -2,4 +2,5 @@ package nu.marginalia.crawling.model;
public interface SerializableCrawlData {
String getSerialIdentifier();
String getDomain();
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.io.processed;
import blue.strategic.parquet.HydratorSupplier;
import blue.strategic.parquet.ParquetReader;
import nu.marginalia.model.processed.DomainRecord;
import nu.marginalia.model.processed.DomainWithIp;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
@ -19,10 +20,10 @@ public class DomainRecordParquetFileReader {
}
@NotNull
public static List<String> getDomainNames(Path path) throws IOException {
public static List<DomainWithIp> getBasicDomainInformation(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(DomainRecord.newDomainNameHydrator()),
List.of("domain"))
List.of("domain", "ip"))
.toList();
}

View File

@ -8,7 +8,6 @@ import org.apache.parquet.schema.*;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import java.sql.Array;
import java.util.ArrayList;
import java.util.List;
@ -48,8 +47,8 @@ public class DomainRecord {
return DomainRecord::dehydrate;
}
public static Hydrator<String, String> newDomainNameHydrator() {
return new DomainNameHydrator();
public static Hydrator<DomainWithIp, DomainWithIp> newDomainNameHydrator() {
return new DomainWithIpHydrator();
}
@ -124,23 +123,26 @@ class DomainHydrator implements Hydrator<DomainRecord, DomainRecord> {
}
}
class DomainNameHydrator implements Hydrator<String, String> {
class DomainWithIpHydrator implements Hydrator<DomainWithIp, DomainWithIp> {
@Override
public String start() {
return "";
public DomainWithIp start() {
return new DomainWithIp();
}
@Override
public String add(String target, String heading, Object value) {
public DomainWithIp add(DomainWithIp target, String heading, Object value) {
if ("domain".equals(heading)) {
return (String) value;
target.domain = (String) value;
}
else if ("ip".equals(heading)) {
target.ip = (String) value;
}
return target;
}
@Override
public String finish(String target) {
public DomainWithIp finish(DomainWithIp target) {
return target;
}
}

View File

@ -0,0 +1,15 @@
package nu.marginalia.model.processed;
import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.ToString;
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode
@ToString
public class DomainWithIp {
public String domain;
public String ip;
}

View File

@ -1,6 +1,7 @@
package nu.marginalia.io.processed;
import nu.marginalia.model.processed.DomainRecord;
import nu.marginalia.model.processed.DomainWithIp;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -53,8 +54,11 @@ class DomainRecordParquetFileReaderTest {
writer.write(second);
}
var domainNames = DomainRecordParquetFileReader.getDomainNames(parquetFile);
assertEquals(List.of("www.marginalia.nu", "memex.marginalia.nu"), domainNames);
var domainInfo = DomainRecordParquetFileReader.getBasicDomainInformation(parquetFile);
assertEquals(List.of(
new DomainWithIp("www.marginalia.nu", "127.0.0.1"),
new DomainWithIp("memex.marginalia.nu", "127.0.0.1")),
domainInfo);
var items = DomainRecordParquetFileReader
.stream(parquetFile)

View File

@ -41,6 +41,7 @@ dependencies {
implementation project(':code:libraries:guarded-regex')
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:geo-ip')
implementation project(':code:libraries:big-string')
implementation project(':code:libraries:language-processing')

View File

@ -138,10 +138,16 @@ public class ConverterMain {
for (var domain : plan.crawlDataIterable(id -> !batchingWorkLog.isItemProcessed(id)))
{
pool.submit(() -> {
ProcessedDomain processed = processor.process(domain);
converterWriter.accept(processed);
heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
try {
ProcessedDomain processed = processor.process(domain);
converterWriter.accept(processed);
}
catch (Exception ex) {
logger.info("Error in processing", ex);
}
finally {
heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
}
});
}

View File

@ -10,6 +10,7 @@ import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.logic.links.LinkGraph;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.*;
import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.model.EdgeDomain;
@ -22,6 +23,7 @@ import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.*;
import java.util.regex.Pattern;
public class DomainProcessor {
private final DocumentProcessor documentProcessor;
@ -29,6 +31,7 @@ public class DomainProcessor {
private final AnchorTagsSource anchorTagsSource;
private final AnchorTextKeywords anchorTextKeywords;
private final LshDocumentDeduplicator documentDeduplicator;
private final GeoIpDictionary geoIpDictionary;
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -37,13 +40,16 @@ public class DomainProcessor {
SiteWords siteWords,
AnchorTagsSourceFactory anchorTagsSourceFactory,
AnchorTextKeywords anchorTextKeywords,
LshDocumentDeduplicator documentDeduplicator) throws SQLException
LshDocumentDeduplicator documentDeduplicator, GeoIpDictionary geoIpDictionary) throws SQLException
{
this.documentProcessor = documentProcessor;
this.siteWords = siteWords;
this.anchorTextKeywords = anchorTextKeywords;
this.documentDeduplicator = documentDeduplicator;
this.anchorTagsSource = anchorTagsSourceFactory.create();
this.geoIpDictionary = geoIpDictionary;
geoIpDictionary.waitReady();
}
@SneakyThrows
@ -54,11 +60,21 @@ public class DomainProcessor {
boolean cookies = false;
String ip = "";
DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain);
DomainLinks externalDomainLinks = null;
while (dataStream.hasNext()) {
var data = dataStream.next();
// Do a lazy load of the external domain links since we don't know the domain
// until we see the first document
if (externalDomainLinks == null) {
var domain = data.getDomain();
if (domain != null) {
externalDomainLinks = anchorTagsSource.getAnchorTags(domain);
}
}
if (data instanceof CrawledDomain crawledDomain) {
ret.domain = new EdgeDomain(crawledDomain.domain);
ret.ip = crawledDomain.ip;
@ -76,8 +92,15 @@ public class DomainProcessor {
try {
if (doc.url == null)
continue;
fixBadCanonicalTag(doc);
// This case should never be reachable, as we should have initialized
// the externalDomainLinks variable above if we made it past the
// doc.url == null check; but we'll leave it here just in case
// to make debugging easier if we break this.
assert externalDomainLinks != null : "externalDomainLinks has not been initialized";
docs.add(documentProcessor.process(doc, externalDomainLinks));
}
catch (Exception ex) {
@ -89,11 +112,22 @@ public class DomainProcessor {
// Add late keywords and features from domain-level information
List<String> terms = new ArrayList<>();
terms.add("ip:"+ip);
String ipCountryCode = geoIpDictionary.getCountry(ip).toLowerCase();
if (!ipCountryCode.isBlank()) {
terms.add("ip:"+ipCountryCode);
}
if (cookies) {
terms.add(HtmlFeature.COOKIES.getKeyword());
}
if (isAcademicDomain(ret.domain)) {
terms.add("special:academia");
}
for (var document : ret.documents) {
if (document.details == null)
continue;
@ -114,6 +148,19 @@ public class DomainProcessor {
return ret;
}
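// Matches academic domains: anything ending in .edu, or an "ac"/"edu" label under a
// two-letter ccTLD, e.g. example.ac.uk or example.edu.au (see academicPattern below)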
private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
private boolean isAcademicDomain(EdgeDomain domain) {
if (domain.domain.endsWith(".edu"))
return true;
if (academicPattern.matcher(domain.domain).matches())
return true;
return false;
}
private void fixBadCanonicalTag(CrawledDocument doc) {
// Some sites have a canonical tag that points to a different domain,
// but our loader can not support this, so we point these back to the

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload;
import com.google.gson.Gson;
import com.google.inject.Inject;
import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory;
import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
@ -21,6 +22,7 @@ public class SideloadSourceFactory {
private final SideloaderProcessing sideloaderProcessing;
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final DocumentKeywordExtractor documentKeywordExtractor;
private final AnchorTextKeywords anchorTextKeywords;
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
private final DirtreeSideloaderFactory dirtreeSideloaderFactory;
private final WarcSideloadFactory warcSideloadFactory;
@ -29,7 +31,7 @@ public class SideloadSourceFactory {
public SideloadSourceFactory(Gson gson,
SideloaderProcessing sideloaderProcessing,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor documentKeywordExtractor,
DocumentKeywordExtractor documentKeywordExtractor, AnchorTextKeywords anchorTextKeywords,
AnchorTagsSourceFactory anchorTagsSourceFactory,
DirtreeSideloaderFactory dirtreeSideloaderFactory,
WarcSideloadFactory warcSideloadFactory) {
@ -37,13 +39,14 @@ public class SideloadSourceFactory {
this.sideloaderProcessing = sideloaderProcessing;
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.documentKeywordExtractor = documentKeywordExtractor;
this.anchorTextKeywords = anchorTextKeywords;
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
this.warcSideloadFactory = warcSideloadFactory;
}
public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException {
return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, sideloaderProcessing);
return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing);
}
public Collection<? extends SideloadSource> sideloadDirtree(Path pathToYamlFile) throws IOException {

View File

@ -3,6 +3,7 @@ package nu.marginalia.converting.sideload.encyclopedia;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import lombok.SneakyThrows;
import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.model.DisqualifiedException;
@ -14,6 +15,7 @@ import nu.marginalia.converting.sideload.SideloaderProcessing;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.util.ProcessingIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -27,15 +29,10 @@ import java.nio.file.Path;
import java.sql.*;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicBoolean;
/** This is an experimental sideloader for encyclopedia.marginalia.nu's database;
* (which serves as a way of loading wikipedia's zim files without binding to GPL2'd code)
*
* <p>
* See https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu for extracting the data
*/
public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
@ -43,6 +40,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
private final Connection connection;
private final EdgeUrl baseUrl;
private final Gson gson;
private final AnchorTextKeywords anchorTextKeywords;
private final SideloaderProcessing sideloaderProcessing;
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaMarginaliaNuSideloader.class);
@ -51,9 +49,11 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
String baseUrl,
Gson gson,
AnchorTagsSourceFactory anchorTagsSourceFactory,
AnchorTextKeywords anchorTextKeywords,
SideloaderProcessing sideloaderProcessing) throws SQLException {
this.baseUrl = EdgeUrl.parse(baseUrl).orElseThrow(AssertionError::new);
this.gson = gson;
this.anchorTextKeywords = anchorTextKeywords;
this.sideloaderProcessing = sideloaderProcessing;
String sqliteDbString = "jdbc:sqlite:" + pathToDbFile.toString();
@ -76,62 +76,24 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
@SneakyThrows
@Override
public Iterator<ProcessedDocument> getDocumentsStream() {
LinkedBlockingQueue<ProcessedDocument> docs = new LinkedBlockingQueue<>(32);
AtomicBoolean isFinished = new AtomicBoolean(false);
return new ProcessingIterator<>(24, 16, (taskConsumer) -> {
DomainLinks domainLinks = getDomainLinks();
ExecutorService executorService = Executors.newFixedThreadPool(16);
Semaphore sem = new Semaphore(16);
var stmt = connection.prepareStatement("""
SELECT url,title,html FROM articles
""");
stmt.setFetchSize(100);
DomainLinks domainLinks = getDomainLinks();
var rs = stmt.executeQuery();
executorService.submit(() -> {
try {
var stmt = connection.prepareStatement("""
SELECT url,title,html FROM articles
""");
stmt.setFetchSize(100);
while (rs.next()) {
var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
String title = rs.getString("title");
String url = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8);
var rs = stmt.executeQuery();
while (rs.next()) {
var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
String title = rs.getString("title");
String url = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8);
sem.acquire();
executorService.submit(() -> {
try {
docs.add(convertDocument(articleParts.parts, title, url, domainLinks));
} catch (URISyntaxException | DisqualifiedException e) {
e.printStackTrace();
} finally {
sem.release();
}
});
}
stmt.close();
}
catch (Exception e) {
e.printStackTrace();
}
finally {
isFinished.set(true);
taskConsumer.accept(() -> convertDocument(articleParts.parts, title, url, domainLinks));
}
});
return new Iterator<>() {
@Override
public boolean hasNext() {
return !isFinished.get() || !docs.isEmpty() || sem.availablePermits() < 16;
}
@SneakyThrows
@Override
public ProcessedDocument next() {
return docs.take();
}
};
}
private DomainLinks getDomainLinks() {
@ -142,30 +104,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
logger.error("Failed to create anchor tags source", ex);
return new DomainLinks();
}
}
ProcessedDocument processJust(String url) throws SQLException, IOException, URISyntaxException, DisqualifiedException {
var stmt = connection.prepareStatement("""
SELECT url,title,html
FROM articles
WHERE url=?
""");
stmt.setFetchSize(100);
stmt.setString(1, url);
var rs = stmt.executeQuery();
if (rs.next()) {
var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
String title = rs.getString("title");
return convertDocument(articleParts.parts,
title,
URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8),
new DomainLinks() // FIXME (2023-11-06): Sideloaded dirtrees don't have access to anchor tag data.
);
}
return null;
}
private ProcessedDocument convertDocument(List<String> parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException {
@ -180,13 +118,22 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
}
fullHtml.append("</body></html>");
return sideloaderProcessing
var doc = sideloaderProcessing
.processDocument(fullUrl,
fullHtml.toString(),
List.of("encyclopedia", "wiki"),
domainLinks,
GeneratorType.WIKI,
10_000_000);
// Add anchor text keywords
if (doc.isProcessedFully()) {
doc.words.addAnchorTerms(
anchorTextKeywords.getAnchorTextKeywords(domainLinks, doc.url)
);
}
return doc;
}
private <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException {

View File

@ -6,6 +6,7 @@ import lombok.SneakyThrows;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
@ -78,7 +79,7 @@ public class CrawlingThenConvertingIntegrationTest {
List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder()) {
new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch();
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch();
}
CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get();

View File

@ -1,38 +1,12 @@
package nu.marginalia.converting.sideload.encyclopedia;
import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.converting.ConverterModule;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.converting.processor.ConverterDomainTypes;
import nu.marginalia.converting.sideload.SideloaderProcessing;
import nu.marginalia.io.processed.DocumentRecordParquetFileReader;
import nu.marginalia.io.processed.DocumentRecordParquetFileWriter;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.processed.DocumentRecord;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.when;
class EncyclopediaMarginaliaNuSideloaderTest {
Path tempFile;
@ -62,93 +36,5 @@ class EncyclopediaMarginaliaNuSideloaderTest {
System.out.printf("%64s\n", Long.toBinaryString(Long.reverseBytes(0x1000000000000000L)));
System.out.printf("%64s\n", Long.toBinaryString(0x10L));
}
@Test
public void debugSpecificArticle() throws SQLException, IOException, URISyntaxException, DisqualifiedException {
Path pathToDbFile = Path.of("/home/vlofgren/Code/MarginaliaSearch/run/samples/articles.db");
if (!Files.exists(pathToDbFile)) {
// not really practical to ship a 40 Gb sqlite file on github
// be @vlofgren to run this test
return;
}
var domainTypesMock = Mockito.mock(ConverterDomainTypes.class);
when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false);
var processing = Guice.createInjector(new ConverterModule(),
new AbstractModule() {
public void configure() {
bind(ConverterDomainTypes.class).toInstance(domainTypesMock);
}
}
)
.getInstance(SideloaderProcessing.class);
var atagsFactory = Mockito.mock(AnchorTagsSourceFactory.class);
when(atagsFactory.create(Mockito.any())).thenReturn(domain -> new DomainLinks());
var sideloader = new EncyclopediaMarginaliaNuSideloader(
pathToDbFile,
"https://en.wikipedia.org/wiki/",
GsonFactory.get(),
atagsFactory,
processing
);
var document = sideloader.processJust("Don't_Tell_Me_(Madonna_song)");
System.out.println(document);
var keywordsBuilt = document.words.build();
var ptr = keywordsBuilt.newPointer();
Map<String, WordMetadata> dirtyAndBlues = new HashMap<>();
while (ptr.advancePointer()) {
String word = ptr.getKeyword();
System.out.println(word + ": " + Long.toHexString(Long.reverseBytes(ptr.getMetadata())));
if (Set.of("dirty", "blues").contains(word)) {
WordMetadata meta = new WordMetadata(ptr.getMetadata());
Assertions.assertNull(
dirtyAndBlues.put(word, meta)
);
}
}
Assertions.assertTrue(dirtyAndBlues.containsKey("dirty"));
Assertions.assertTrue(dirtyAndBlues.containsKey("blues"));
Assertions.assertNotEquals(
dirtyAndBlues.get("dirty"),
dirtyAndBlues.get("blues")
);
try (var dw = new DocumentRecordParquetFileWriter(tempFile)) {
dw.write(new DocumentRecord(
"encyclopedia.marginalia.nu",
document.url.toString(),
0,
document.state.toString(),
document.stateReason,
document.details.title,
document.details.description,
HtmlFeature.encode(document.details.features),
document.details.standard.name(),
document.details.length,
document.details.hashCode,
(float) document.details.quality,
document.details.metadata.encode(),
document.details.pubYear,
List.of(keywordsBuilt.keywords),
new TLongArrayList(keywordsBuilt.metadata)
));
}
var record = DocumentRecordParquetFileReader.streamKeywordsProjection(tempFile).findFirst().get();
String[] words = record.words.toArray(String[]::new);
long[] meta = record.metas.toArray();
assertArrayEquals(keywordsBuilt.keywords, words);
assertArrayEquals(keywordsBuilt.metadata, meta);
}
}

View File

@ -0,0 +1,38 @@
package nu.marginalia.util;
import org.junit.jupiter.api.Test;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
class ProcessingIteratorTest {
@Test
public void test() {
Set<Integer> output = new HashSet<>();
var iter = new ProcessingIterator<Integer>(2, 2, q -> {
for (int i = 0; i < 10_000; i++) {
int j = i;
q.accept(() -> task(j));
}
});
while (iter.hasNext()) {
output.add(iter.next());
}
assertEquals(10_000, output.size());
for (int i = 0; i < 10_000; i++) {
assertTrue(output.contains(i));
}
}
int task(int n) throws InterruptedException {
TimeUnit.NANOSECONDS.sleep(10);
return n;
}
}

View File

@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.atags.source.AnchorTagsSource;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.spec.CrawlSpecProvider;
@ -51,6 +52,7 @@ public class CrawlerMain {
private final ProcessHeartbeatImpl heartbeat;
private final MessageQueueFactory messageQueueFactory;
private final DomainProber domainProber;
private final FileStorageService fileStorageService;
private final DbCrawlSpecProvider dbCrawlSpecProvider;
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
@ -70,7 +72,7 @@ public class CrawlerMain {
@Inject
public CrawlerMain(UserAgent userAgent,
ProcessHeartbeatImpl heartbeat,
MessageQueueFactory messageQueueFactory,
MessageQueueFactory messageQueueFactory, DomainProber domainProber,
FileStorageService fileStorageService,
ProcessConfiguration processConfiguration,
DbCrawlSpecProvider dbCrawlSpecProvider,
@ -78,6 +80,7 @@ public class CrawlerMain {
Gson gson) {
this.heartbeat = heartbeat;
this.messageQueueFactory = messageQueueFactory;
this.domainProber = domainProber;
this.fileStorageService = fileStorageService;
this.dbCrawlSpecProvider = dbCrawlSpecProvider;
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
@ -211,14 +214,13 @@ public class CrawlerMain {
try (CrawledDomainWriter writer = new CrawledDomainWriter(outputDir, domain, id);
var warcRecorder = new WarcRecorder(); // write to a temp file for now
var retreiver = new CrawlerRetreiver(fetcher, specification, warcRecorder, writer::accept);
var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder, writer::accept);
CrawlDataReference reference = getReference())
{
Thread.currentThread().setName("crawling:" + domain);
var domainLinks = anchorTagsSource.getAnchorTags(domain);
int size = retreiver.fetch(domainLinks, reference);
workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size);

View File

@ -45,7 +45,7 @@ public class CrawlerRetreiver implements AutoCloseable {
private static final UrlBlocklist urlBlocklist = new UrlBlocklist();
private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
private static final DomainProber domainProber = new DomainProber();
private final DomainProber domainProber;
private final SitemapRetriever sitemapRetriever;
private final DomainCrawlFrontier crawlFrontier;
private final WarcRecorder warcRecorder;
@ -59,12 +59,14 @@ public class CrawlerRetreiver implements AutoCloseable {
private static final String documentWasSameTag = "SAME-BY-COMPARISON";
public CrawlerRetreiver(HttpFetcher fetcher,
DomainProber domainProber,
CrawlSpecRecord specs,
WarcRecorder warcRecorder,
Consumer<SerializableCrawlData> writer)
{
this.warcRecorder = warcRecorder;
this.fetcher = fetcher;
this.domainProber = domainProber;
domain = specs.domain;

View File

@ -1,5 +1,7 @@
package nu.marginalia.crawl.retreival;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawling.model.CrawlerDomainStatus;
@ -11,17 +13,21 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.util.function.Predicate;
@Singleton
public class DomainProber {
private final Logger logger = LoggerFactory.getLogger(DomainProber.class);
private static IpBlockList ipBlockList;
private final Predicate<EdgeDomain> domainBlacklist;
static {
try {
ipBlockList = new IpBlockList(new GeoIpBlocklist());
} catch (Exception e) {
throw new RuntimeException(e);
}
@Inject
public DomainProber(IpBlockList ipBlockList) {
this.domainBlacklist = ipBlockList::isAllowed;
}
/** For testing */
public DomainProber(Predicate<EdgeDomain> domainBlacklist) {
this.domainBlacklist = domainBlacklist;
}
/** To detect problems early we do a probing request to the domain before we start crawling it properly.
@ -37,7 +43,7 @@ public class DomainProber {
return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
}
if (!ipBlockList.isAllowed(firstUrlInQueue.domain))
if (!domainBlacklist.test(firstUrlInQueue.domain))
return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");
var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
@ -62,7 +68,7 @@ public class DomainProber {
/** This domain redirects to another domain */
public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
/** If the retreivala of the probed url was successful, return the url as it was fetched
/** If the retrieval of the probed url was successful, return the url as it was fetched
* (which may be different from the url we probed, if we attempted another URL schema).
*
* @param probedUrl The url we successfully probed

View File

@ -90,7 +90,10 @@ public class HttpFetcherImpl implements HttpFetcher {
}
@Inject
public HttpFetcherImpl(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) {
public HttpFetcherImpl(@Named("user-agent") String userAgent,
Dispatcher dispatcher,
ConnectionPool connectionPool)
{
this.client = createClient(dispatcher, connectionPool);
this.userAgent = userAgent;
this.contentTypeProber = new ContentTypeProber(userAgent, client);

View File

@ -3,6 +3,7 @@ package nu.marginalia.crawling.retreival;
import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.*;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.model.CrawledDocument;
@ -65,7 +66,7 @@ public class CrawlerMockFetcherTest {
void crawl(CrawlSpecRecord spec, Consumer<SerializableCrawlData> consumer) throws IOException {
try (var recorder = new WarcRecorder()) {
new CrawlerRetreiver(fetcherMock, spec, recorder, consumer)
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder, consumer)
.fetch();
}
}

View File

@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
@ -59,7 +60,7 @@ class CrawlerRetreiverTest {
List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder(tempFile)) {
new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch();
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch();
} catch (IOException ex) {
Assertions.fail(ex);
}
@ -103,7 +104,7 @@ class CrawlerRetreiverTest {
List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder()) {
new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch();
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch();
}
catch (IOException ex) {
Assertions.fail(ex);
@ -137,7 +138,7 @@ class CrawlerRetreiverTest {
List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder()) {
new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch();
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch();
}
catch (IOException ex) {
Assertions.fail(ex);
@ -178,7 +179,7 @@ class CrawlerRetreiverTest {
Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
try (var recorder = new WarcRecorder()) {
new CrawlerRetreiver(httpFetcher, specs, recorder, d -> {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, d -> {
data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d);
if (d instanceof CrawledDocument doc) {
System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
@ -202,7 +203,7 @@ class CrawlerRetreiverTest {
CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
try (var recorder = new WarcRecorder()) {
new CrawlerRetreiver(httpFetcher, specs, recorder, d -> {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, d -> {
if (d instanceof CrawledDocument doc) {
System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
}

View File

@ -9,6 +9,7 @@ import nu.marginalia.io.processed.DomainRecordParquetFileReader;
import nu.marginalia.loading.LoaderInputData;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.processed.DomainRecord;
import nu.marginalia.model.processed.DomainWithIp;
import nu.marginalia.process.control.ProcessHeartbeatImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -51,9 +52,9 @@ public class DomainLoaderService {
) {
try (var inserter = new DomainInserter(conn, nodeId)) {
for (var domain : readSetDomainNames(inputData)) {
inserter.accept(new EdgeDomain(domain));
domainNamesAll.add(domain);
for (var domainWithIp : readBasicDomainInformation(inputData)) {
inserter.accept(new EdgeDomain(domainWithIp.domain));
domainNamesAll.add(domainWithIp.domain);
}
}
try (var inserter = new DomainInserter(conn, -1)) {
@ -63,9 +64,9 @@ public class DomainLoaderService {
}
}
try (var updater = new DomainAffinityUpdater(conn, nodeId)) {
for (var domain : readSetDomainNames(inputData)) {
updater.accept(new EdgeDomain(domain));
try (var updater = new DomainAffinityAndIpUpdater(conn, nodeId)) {
for (var domainWithIp : readBasicDomainInformation(inputData)) {
updater.accept(new EdgeDomain(domainWithIp.domain), domainWithIp.ip);
}
}
@ -84,15 +85,15 @@ public class DomainLoaderService {
return ret;
}
Collection<String> readSetDomainNames(LoaderInputData inputData) throws IOException {
final Set<String> domainNamesAll = new HashSet<>(100_000);
Collection<DomainWithIp> readBasicDomainInformation(LoaderInputData inputData) throws IOException {
final Set<DomainWithIp> domainsAll = new HashSet<>(100_000);
var domainFiles = inputData.listDomainFiles();
for (var file : domainFiles) {
domainNamesAll.addAll(DomainRecordParquetFileReader.getDomainNames(file));
domainsAll.addAll(DomainRecordParquetFileReader.getBasicDomainInformation(file));
}
return domainNamesAll;
return domainsAll;
}
Collection<String> readReferencedDomainNames(LoaderInputData inputData) throws IOException {
@ -164,20 +165,25 @@ public class DomainLoaderService {
statement.close();
}
}
private static class DomainAffinityUpdater implements AutoCloseable {
private static class DomainAffinityAndIpUpdater implements AutoCloseable {
private final PreparedStatement statement;
private final int nodeAffinity;
private int count = 0;
public DomainAffinityUpdater(Connection connection, int affinity) throws SQLException {
public DomainAffinityAndIpUpdater(Connection connection, int affinity) throws SQLException {
this.nodeAffinity = affinity;
statement = connection.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY = ? WHERE DOMAIN_NAME=?");
statement = connection.prepareStatement("""
UPDATE EC_DOMAIN
SET NODE_AFFINITY = ?, IP = ?
WHERE DOMAIN_NAME=?
""");
}
public void accept(EdgeDomain domain) throws SQLException {
public void accept(EdgeDomain domain, String ip) throws SQLException {
statement.setInt(1, nodeAffinity);
statement.setString(2, domain.toString());
statement.setString(2, ip);
statement.setString(3, domain.toString());
statement.addBatch();
if (++count > 1000) {

View File

@ -6,7 +6,6 @@ import nu.marginalia.ProcessConfiguration;
import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter;
import nu.marginalia.io.processed.DomainRecordParquetFileWriter;
import nu.marginalia.io.processed.ProcessedDataFileNames;
import nu.marginalia.loader.DbTestUtil;
import nu.marginalia.loading.LoaderInputData;
import nu.marginalia.model.processed.DomainLinkRecord;
import nu.marginalia.model.processed.DomainRecord;
@ -21,10 +20,8 @@ import org.testcontainers.junit.jupiter.Testcontainers;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.junit.jupiter.api.Assertions.*;
@ -99,7 +96,7 @@ class DomainLoaderServiceTest {
// Verify
Set<String> expectedDomains1 = Sets.union(new HashSet<>(domains1), new HashSet<>(domains2));
assertEquals(expectedDomains1, domainService.readSetDomainNames(new LoaderInputData(workDir, 2)));
assertEquals(expectedDomains1, domainService.readBasicDomainInformation(new LoaderInputData(workDir, 2)).stream().map(d -> d.domain).collect(Collectors.toSet()));
Set<String> expectedDomains2 = new HashSet<>(linkDomains);
assertEquals(expectedDomains2, domainService.readReferencedDomainNames(new LoaderInputData(workDir, 2)));

View File

@ -10,8 +10,6 @@ import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.query.client.QueryClient;
import nu.marginalia.query.model.QueryParams;
@ -64,7 +62,7 @@ public class ApiSearchOperator {
return switch (index) {
case 0 -> SearchSetIdentifier.NONE;
case 1 -> SearchSetIdentifier.SMALLWEB;
case 2 -> SearchSetIdentifier.RETRO;
case 2 -> SearchSetIdentifier.POPULAR;
case 3 -> SearchSetIdentifier.NONE;
case 5 -> SearchSetIdentifier.NONE;
default -> SearchSetIdentifier.NONE;

View File

@ -31,7 +31,6 @@ public class SearchService extends Service {
SearchFrontPageService frontPageService,
SearchErrorPageService errorPageService,
SearchAddToCrawlQueueService addToCrawlQueueService,
SearchFlagSiteService flagSiteService,
SearchSiteInfoService siteInfoService,
SearchQueryService searchQueryService
) {

View File

@ -1,28 +0,0 @@
package nu.marginalia.search.model;
import lombok.*;
import nu.marginalia.model.EdgeDomain;
import java.util.List;
@Getter @AllArgsConstructor @NoArgsConstructor @Builder
@ToString
public class DomainInformation {
EdgeDomain domain;
boolean blacklisted;
int pagesKnown;
int pagesFetched;
int pagesIndexed;
int incomingLinks;
int outboundLinks;
int nodeAffinity;
double ranking;
boolean suggestForCrawling;
boolean inCrawlQueue;
boolean unknownDomain;
String state;
List<EdgeDomain> linkingDomains;
}

View File

@ -5,7 +5,6 @@ import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.command.SearchAdtechParameter;
import nu.marginalia.search.command.SearchJsParameter;
import nu.marginalia.search.command.SearchParameters;
import org.apache.regexp.RE;
import java.util.List;
@ -37,7 +36,7 @@ public class SearchFilters {
filterGroups = List.of(
List.of(
new Filter("No Filter", SearchProfile.NO_FILTER, parameters),
new Filter("Popular", SearchProfile.DEFAULT, parameters),
new Filter("Popular", SearchProfile.POPULAR, parameters),
new Filter("Small Web", SearchProfile.SMALLWEB, parameters),
new Filter("Blogosphere", SearchProfile.BLOGOSPHERE, parameters),
new Filter("Academia", SearchProfile.ACADEMIA, parameters)

View File

@ -8,19 +8,16 @@ import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import java.util.Objects;
public enum SearchProfile {
DEFAULT("default", SearchSetIdentifier.RETRO),
POPULAR("default", SearchSetIdentifier.POPULAR),
SMALLWEB("modern", SearchSetIdentifier.SMALLWEB),
BLOGOSPHERE("blogosphere", SearchSetIdentifier.BLOGS),
NO_FILTER("corpo", SearchSetIdentifier.NONE),
YOLO("yolo", SearchSetIdentifier.NONE),
VINTAGE("vintage", SearchSetIdentifier.NONE),
TILDE("tilde", SearchSetIdentifier.NONE),
CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE),
ACADEMIA("academia", SearchSetIdentifier.ACADEMIA),
ACADEMIA("academia", SearchSetIdentifier.NONE),
PLAIN_TEXT("plain-text", SearchSetIdentifier.NONE),
FOOD("food", SearchSetIdentifier.NONE),
CRAFTS("crafts", SearchSetIdentifier.NONE),
CLASSICS("classics", SearchSetIdentifier.NONE),
FOOD("food", SearchSetIdentifier.POPULAR),
FORUM("forum", SearchSetIdentifier.NONE),
WIKI("wiki", SearchSetIdentifier.NONE),
DOCS("docs", SearchSetIdentifier.NONE),
@ -38,7 +35,7 @@ public enum SearchProfile {
private final static SearchProfile[] values = values();
public static SearchProfile getSearchProfile(String param) {
if (null == param) {
return NO_FILTER;
return POPULAR;
}
for (var profile : values) {
@ -47,12 +44,12 @@ public enum SearchProfile {
}
}
return NO_FILTER;
return POPULAR;
}
public void addTacitTerms(SearchSubquery subquery) {
if (this == ACADEMIA) {
subquery.searchTermsPriority.add("tld:edu");
subquery.searchTermsAdvice.add("special:academia");
}
if (this == VINTAGE) {
subquery.searchTermsPriority.add("format:html123");
@ -75,9 +72,7 @@ public enum SearchProfile {
}
if (this == FOOD) {
subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
}
if (this == CRAFTS) {
subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_CRAFTS.getKeyword());
subquery.searchTermsExclude.add("special:ads");
}
}
@ -106,13 +101,5 @@ public enum SearchProfile {
else return SpecificationLimit.none();
}
public String getNearDomain() {
if (this == CLASSICS) {
return "classics.mit.edu";
}
return null;
}
}
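Editor's note: a short usage sketch of the changed fallback behaviour, assuming getSearchProfile matches profiles by the parameter string passed to the enum constructor (the matching line itself sits between the hunks). The results shown follow directly from the diff: missing or unknown parameters now resolve to POPULAR rather than NO_FILTER.

// Sketch only; resolution results per the hunk above.
SearchProfile a = SearchProfile.getSearchProfile(null);        // POPULAR
SearchProfile b = SearchProfile.getSearchProfile("academia");  // ACADEMIA (assumed match on "academia")
SearchProfile c = SearchProfile.getSearchProfile("no-such");   // POPULAR (fallback)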

View File

@ -1,276 +0,0 @@
package nu.marginalia.search.siteinfo;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.search.model.DomainInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.sql.SQLException;
import java.util.*;
/*
TODO: This class needs to be refactored, a lot of
these SQL queries are redundant and can be
collapsed into one single query that fetches
all the information
*/
@Singleton
public class DomainInformationService {
private DbDomainQueries dbDomainQueries;
private HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public DomainInformationService(
DbDomainQueries dbDomainQueries,
HikariDataSource dataSource) {
this.dbDomainQueries = dbDomainQueries;
this.dataSource = dataSource;
}
public Optional<DomainInformation> domainInfo(String site) {
OptionalInt maybeDomainId = getDomainFromPartial(site);
if (maybeDomainId.isEmpty()) {
return Optional.empty();
}
int domainId = maybeDomainId.getAsInt();
Optional<EdgeDomain> domain = dbDomainQueries.getDomain(domainId);
if (domain.isEmpty()) {
return Optional.empty();
}
boolean blacklisted = isBlacklisted(domain.get());
int pagesKnown = getPagesKnown(domainId);
int pagesVisited = getPagesVisited(domainId);
int pagesIndexed = getPagesIndexed(domainId);
int incomingLinks = getIncomingLinks(domainId);
int outboundLinks = getOutboundLinks(domainId);
int nodeAffinity = getNodeAffinity(domainId);
boolean inCrawlQueue = inCrawlQueue(domainId);
double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100;
DomainIndexingState state = getDomainState(domainId);
List<EdgeDomain> linkingDomains = getLinkingDomains(domainId);
var di = DomainInformation.builder()
.domain(domain.get())
.blacklisted(blacklisted)
.pagesKnown(pagesKnown)
.pagesFetched(pagesVisited)
.pagesIndexed(pagesIndexed)
.incomingLinks(incomingLinks)
.outboundLinks(outboundLinks)
.ranking(rank)
.state(state.desc)
.linkingDomains(linkingDomains)
.inCrawlQueue(inCrawlQueue)
.nodeAffinity(nodeAffinity)
.suggestForCrawling((pagesVisited == 0 && outboundLinks == 0 && !inCrawlQueue))
.build();
return Optional.of(di);
}
private int getNodeAffinity(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("""
SELECT NODE_AFFINITY FROM EC_DOMAIN WHERE ID=?
""")) {
stmt.setInt(1, domainId);
var rs = stmt.executeQuery();
if (rs.next())
return rs.getInt(1);
}
}
catch (SQLException ex) {
logger.error("SQL error", ex);
}
return -1;
}
@SneakyThrows
private boolean inCrawlQueue(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
SELECT 1 FROM CRAWL_QUEUE
INNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME
WHERE EC_DOMAIN.ID=?
"""))
{
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
return rsp.next();
}
}
}
private OptionalInt getDomainFromPartial(String site) {
return dbDomainQueries.tryGetDomainId(new EdgeDomain(site));
}
@SneakyThrows
public boolean isBlacklisted(EdgeDomain domain) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN IN (?,?)")) {
stmt.setString(1, domain.domain);
stmt.setString(2, domain.toString());
var rsp = stmt.executeQuery();
return rsp.next();
}
}
}
@SneakyThrows
public int getPagesKnown(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
public int getPagesVisited(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
public int getPagesIndexed(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
public int getIncomingLinks(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
public int getOutboundLinks(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
public DomainIndexingState getDomainState(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return DomainIndexingState.valueOf(rsp.getString(1));
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return DomainIndexingState.ERROR;
}
public List<EdgeDomain> getLinkingDomains(int domainId) {
try (var connection = dataSource.getConnection()) {
List<EdgeDomain> results = new ArrayList<>(25);
try (var stmt = connection.prepareStatement("SELECT SOURCE_DOMAIN FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
while (rsp.next()) {
results.add(new EdgeDomain(rsp.getString(1)));
}
return results;
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return Collections.emptyList();
}
public double getRank(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getDouble(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return 1;
}
}

View File

@ -1,16 +1,16 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import nu.marginalia.assistant.client.AssistantClient;
import nu.marginalia.assistant.client.model.SimilarDomain;
import nu.marginalia.client.Context;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.search.SearchOperator;
import nu.marginalia.search.model.DomainInformation;
import nu.marginalia.assistant.client.model.DomainInformation;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.siteinfo.DomainInformationService;
import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
import spark.Request;
import spark.Response;
@ -23,22 +23,19 @@ import java.util.Map;
public class SearchSiteInfoService {
private final SearchOperator searchOperator;
private final SimilarDomainsService similarDomains;
private final DomainInformationService domainInformationService;
private final AssistantClient assistantClient;
private final SearchFlagSiteService flagSiteService;
private final DbDomainQueries domainQueries;
private final MustacheRenderer<Object> renderer;
@Inject
public SearchSiteInfoService(SearchOperator searchOperator,
SimilarDomainsService similarDomains,
DomainInformationService domainInformationService,
AssistantClient assistantClient,
RendererFactory rendererFactory,
SearchFlagSiteService flagSiteService,
DbDomainQueries domainQueries) throws IOException {
this.searchOperator = searchOperator;
this.similarDomains = similarDomains;
this.domainInformationService = domainInformationService;
this.assistantClient = assistantClient;
this.flagSiteService = flagSiteService;
this.domainQueries = domainQueries;
@ -108,13 +105,6 @@ public class SearchSiteInfoService {
false);
}
private DomainInformation dummyInformation(String domainName) {
return DomainInformation.builder()
.domain(new EdgeDomain(domainName))
.suggestForCrawling(true)
.unknownDomain(true)
.build();
}
private Backlinks listLinks(Context ctx, String domainName) {
return new Backlinks(domainName,
@ -126,13 +116,24 @@ public class SearchSiteInfoService {
final int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);
final DomainInformation domainInfo = domainInformationService.domainInfo(domainName)
.orElseGet(() -> dummyInformation(domainName));
final DomainInformation domainInfo;
final List<SimilarDomain> similarSet;
final List<SimilarDomain> linkingDomains;
final List<SimilarDomainsService.SimilarDomain> similarSet =
similarDomains.getSimilarDomains(domainId, 100);
final List<SimilarDomainsService.SimilarDomain> linkingDomains =
similarDomains.getLinkingDomains(domainId, 100);
if (domainId < 0 || !assistantClient.isAccepting()) {
domainInfo = createDummySiteInfo(domainName);
similarSet = List.of();
linkingDomains = List.of();
}
else {
domainInfo = assistantClient.domainInformation(ctx, domainId).blockingFirst();
similarSet = assistantClient
.similarDomains(ctx, domainId, 100)
.blockingFirst();
linkingDomains = assistantClient
.linkedDomains(ctx, domainId, 100)
.blockingFirst();
}
return new SiteInfoWithContext(domainName,
domainId,
@ -141,6 +142,15 @@ public class SearchSiteInfoService {
linkingDomains
);
}
private DomainInformation createDummySiteInfo(String domainName) {
return DomainInformation.builder()
.domain(new EdgeDomain(domainName))
.suggestForCrawling(true)
.unknownDomain(true)
.build();
}
private Docs listDocs(Context ctx, String domainName) {
return new Docs(domainName,
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
@ -181,13 +191,13 @@ public class SearchSiteInfoService {
String domain,
long domainId,
DomainInformation domainInformation,
List<SimilarDomainsService.SimilarDomain> similar,
List<SimilarDomainsService.SimilarDomain> linking) {
List<SimilarDomain> similar,
List<SimilarDomain> linking) {
public SiteInfoWithContext(String domain,
long domainId,
DomainInformation domainInformation,
List<SimilarDomainsService.SimilarDomain> similar,
List<SimilarDomainsService.SimilarDomain> linking
List<SimilarDomain> similar,
List<SimilarDomain> linking
)
{
this(Map.of("info", true),

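Editor's note: the replacement logic above blocks on the three assistant calls one after another. Purely as a hedged aside, not what the commit does, the same data could be requested in one combined subscription; whether the requests actually overlap depends on the client's scheduling, and ctx, domainId, domainName and assistantClient are the locals and fields visible in the hunk.

// Alternative sketch only: combine the three assistant observables and block once.
var page = Observable.zip(
        assistantClient.domainInformation(ctx, domainId),
        assistantClient.similarDomains(ctx, domainId, 100),
        assistantClient.linkedDomains(ctx, domainId, 100),
        (info, similar, linking) ->
                new SiteInfoWithContext(domainName, domainId, info, similar, linking))
    .blockingFirst();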
View File

@ -99,7 +99,6 @@
</section>
<section id="legal">
<h1>Policies</h1>
This website complies with the GDPR by <em>not collecting any personal
information</em>, and with the EU Cookie Directive by <em>not using
cookies</em>. <a href="https://memex.marginalia.nu/projects/edge/privacy.gmi">More Information</a>.
@ -109,8 +108,13 @@
<h1> Open Source </h1>
The search engine is open source with an AGPL license. The sources can be perused at
<tt><a href="https://git.marginalia.nu/">https://git.marginalia.nu/</a></tt>.
<h1>Data Sources</h1>
IP geolocation is sourced from the IP2Location LITE data available from
<a rel="external noopener nofollow" href="https://lite.ip2location.com/">https://lite.ip2location.com/</a>
under
<a rel="external noopener nofollow" href="https://creativecommons.org/licenses/by-sa/4.0/">CC-BY-SA&nbsp;4.0</a>.
</section>
</footer>
<script src="/tts.js"></script>

View File

@ -6,5 +6,6 @@
Pages Known: {{pagesKnown}} <br/>
Pages Crawled: {{pagesFetched}} <br/>
Pages Indexed: {{pagesIndexed}} <br/>
IP: {{ip}} {{#if ipCountry}}<span title="{{ipCountry}}">{{getIpFlag}}</span>{{/if}}<br/>
</fieldset>
<br/>

View File

@ -27,11 +27,13 @@ dependencies {
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:common:model')
implementation project(':code:common:db')
implementation project(':code:common:service-discovery')
implementation project(':code:common:service-client')
implementation project(':code:features-search:screenshots')
implementation project(':code:libraries:geo-ip')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')

View File

@ -3,6 +3,8 @@ package nu.marginalia.assistant;
import com.google.gson.Gson;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.assistant.domains.DomainInformationService;
import nu.marginalia.assistant.domains.SimilarDomainsService;
import nu.marginalia.assistant.eval.Units;
import nu.marginalia.assistant.suggest.Suggestions;
import nu.marginalia.assistant.eval.MathParser;
@ -16,11 +18,16 @@ import spark.Request;
import spark.Response;
import spark.Spark;
import java.util.ArrayList;
import java.util.Objects;
public class AssistantService extends Service {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = GsonFactory.get();
private final Units units;
private final MathParser mathParser;
private final SimilarDomainsService similarDomainsService;
private final DomainInformationService domainInformationService;
private final Suggestions suggestions;
@SneakyThrows
@ -30,12 +37,16 @@ public class AssistantService extends Service {
MathParser mathParser,
Units units,
ScreenshotService screenshotService,
SimilarDomainsService similarDomainsService,
DomainInformationService domainInformationService,
Suggestions suggestions)
{
super(params);
this.mathParser = mathParser;
this.units = units;
this.similarDomainsService = similarDomainsService;
this.domainInformationService = domainInformationService;
this.suggestions = suggestions;
Spark.staticFiles.expireTime(600);
@ -56,12 +67,50 @@ public class AssistantService extends Service {
rsp,
req.queryParams("value")
));
Spark.get("/domain/:id/similar", this::getSimilarDomains, this::convertToJson);
Spark.get("/domain/:id/linking", this::getLinkingDomains, this::convertToJson);
Spark.get("/domain/:id/info", this::getDomainInformation, this::convertToJson);
Spark.get("/public/suggest/", this::getSuggestions, this::convertToJson);
Spark.awaitInitialization();
}
private Object getSimilarDomains(Request request, Response response) {
int domainId = Integer.parseInt(request.params("id"));
int count = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("count"), "25"));
response.type("application/json");
if (!similarDomainsService.isReady()) {
return new ArrayList<>();
}
return similarDomainsService.getSimilarDomains(domainId, count);
}
private Object getLinkingDomains(Request request, Response response) {
int domainId = Integer.parseInt(request.params("id"));
int count = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("count"), "25"));
response.type("application/json");
if (!similarDomainsService.isReady()) {
return new ArrayList<>();
}
return similarDomainsService.getLinkingDomains(domainId, count);
}
private Object getDomainInformation(Request request, Response response) {
int domainId = Integer.parseInt(request.params("id"));
response.type("application/json");
var maybeDomainInfo = domainInformationService.domainInfo(domainId);
if (maybeDomainInfo.isEmpty()) {
Spark.halt(404);
}
return maybeDomainInfo.get();
}
private Object getSuggestions(Request request, Response response) {
response.type("application/json");
var param = request.queryParams("partial");

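Editor's note: for orientation, a hedged sketch of what a direct caller of the three new routes sees. In the repository traffic goes through the assistant service client, so the host, port and domain id below are placeholders only; the response behaviour (empty array while the similarity data loads, 404 from /domain/:id/info for unknown domains) is taken from the handlers above.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

class SimilarDomainsCallSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder host/port and domain id; illustrative only.
        var http = HttpClient.newHttpClient();
        var req = HttpRequest.newBuilder(
                URI.create("http://localhost:8080/domain/42/similar?count=10"))
                .GET()
                .build();
        var rsp = http.send(req, HttpResponse.BodyHandlers.ofString());
        // Expect a JSON array of similar domains; an empty array is returned
        // while the similarity data is still loading.
        System.out.println(rsp.statusCode() + " " + rsp.body());
    }
}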
View File

@ -0,0 +1,115 @@
package nu.marginalia.assistant.domains;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.assistant.client.model.DomainInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
@Singleton
public class DomainInformationService {
private final GeoIpDictionary geoIpDictionary;
private DbDomainQueries dbDomainQueries;
private HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public DomainInformationService(
DbDomainQueries dbDomainQueries,
GeoIpDictionary geoIpDictionary,
HikariDataSource dataSource) {
this.dbDomainQueries = dbDomainQueries;
this.geoIpDictionary = geoIpDictionary;
this.dataSource = dataSource;
}
public Optional<DomainInformation> domainInfo(int domainId) {
Optional<EdgeDomain> domain = dbDomainQueries.getDomain(domainId);
if (domain.isEmpty()) {
return Optional.empty();
}
var builder = DomainInformation.builder();
try (var connection = dataSource.getConnection();
var stmt = connection.createStatement();
) {
boolean inCrawlQueue;
int outboundLinks = 0;
int pagesVisited = 0;
ResultSet rs;
rs = stmt.executeQuery(STR."""
SELECT IP, NODE_AFFINITY, DOMAIN_NAME, STATE, IFNULL(RANK, 1) AS RANK
FROM EC_DOMAIN WHERE ID=\{domainId}
""");
if (rs.next()) {
String ip = rs.getString("IP");
builder.ip(ip);
builder.ipCountry(geoIpDictionary.getCountry(ip));
builder.nodeAffinity(rs.getInt("NODE_AFFINITY"));
builder.domain(new EdgeDomain(rs.getString("DOMAIN_NAME")));
builder.state(rs.getString("STATE"));
builder.ranking(Math.round(100.0*(1.0-rs.getDouble("RANK"))));
}
rs = stmt.executeQuery(STR."""
SELECT 1 FROM CRAWL_QUEUE
INNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME
WHERE EC_DOMAIN.ID=\{domainId}
""");
inCrawlQueue = rs.next();
builder.inCrawlQueue(inCrawlQueue);
rs = stmt.executeQuery(STR."""
SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=\{domainId}
""");
if (rs.next()) {
builder.incomingLinks(rs.getInt(1));
}
rs = stmt.executeQuery(STR."""
SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=\{domainId}
""");
if (rs.next()) {
builder.outboundLinks(rs.getInt(1));
outboundLinks = rs.getInt(1);
}
rs = stmt.executeQuery(STR."""
SELECT KNOWN_URLS, GOOD_URLS, VISITED_URLS FROM DOMAIN_METADATA WHERE ID=\{domainId}
""");
if (rs.next()) {
pagesVisited = rs.getInt("VISITED_URLS");
builder.pagesKnown(rs.getInt("KNOWN_URLS"));
builder.pagesIndexed(rs.getInt("GOOD_URLS"));
builder.pagesFetched(rs.getInt("VISITED_URLS"));
}
builder.suggestForCrawling((pagesVisited == 0 && outboundLinks == 0 && !inCrawlQueue));
return Optional.of(builder.build());
}
catch (SQLException ex) {
logger.error("SQL error", ex);
return Optional.empty();
}
}
}
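Editor's note: the new service interpolates domainId into the SQL via Java string templates; since the value is a primitive int this cannot inject, but the same lookup can be written with a bound parameter. A sketch of that equivalent form for the first query only, as a standalone helper; behaviour is assumed identical.

import java.sql.SQLException;
import java.util.Optional;

class DomainLookupSketch {
    // Sketch: the EC_DOMAIN lookup from the hunk above with a bound parameter
    // instead of a string-template literal.
    static Optional<String> lookupIp(javax.sql.DataSource dataSource, int domainId) throws SQLException {
        try (var connection = dataSource.getConnection();
             var stmt = connection.prepareStatement("""
                     SELECT IP, NODE_AFFINITY, DOMAIN_NAME, STATE, IFNULL(RANK, 1) AS RANK
                     FROM EC_DOMAIN WHERE ID = ?
                     """)) {
            stmt.setInt(1, domainId);
            try (var rs = stmt.executeQuery()) {
                if (rs.next())
                    return Optional.of(rs.getString("IP"));
            }
        }
        return Optional.empty();
    }
}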

View File

@ -1,4 +1,4 @@
package nu.marginalia.search.svc;
package nu.marginalia.assistant.domains;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
@ -6,11 +6,10 @@ import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntDoubleHashMap;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TLongDoubleHashMap;
import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.assistant.client.model.SimilarDomain;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -40,6 +39,8 @@ public class SimilarDomainsService {
public volatile double[] domainRanks = null;
public volatile String[] domainNames = null;
volatile boolean isReady = false;
@Inject
public SimilarDomainsService(HikariDataSource dataSource) {
this.dataSource = dataSource;
@ -167,6 +168,7 @@ public class SimilarDomainsService {
logger.info("Loaded {} domains", domainRanks.length);
logger.info("All done!");
isReady = true;
}
}
catch (SQLException throwables) {
@ -174,7 +176,11 @@ public class SimilarDomainsService {
}
}
double getRelatedness(int a, int b) {
public boolean isReady() {
return isReady;
}
private double getRelatedness(int a, int b) {
int lowerIndex = Math.min(domainIdToIdx.get(a), domainIdToIdx.get(b));
int higherIndex = Math.max(domainIdToIdx.get(a), domainIdToIdx.get(b));
@ -233,14 +239,14 @@ public class SimilarDomainsService {
indexedDomains.get(idx),
activeDomains.get(idx),
screenshotDomains.get(idx),
LinkType.find(
SimilarDomain.LinkType.find(
linkingIdsStoD.contains(idx),
linkingIdsDtoS.contains(idx)
)
));
}
domains.removeIf(d -> d.url.domain.toString().length() > 32);
domains.removeIf(d -> d.url().domain.toString().length() > 32);
return domains;
}
@ -319,84 +325,16 @@ public class SimilarDomainsService {
indexedDomains.get(idx),
activeDomains.get(idx),
screenshotDomains.get(idx),
LinkType.find(
SimilarDomain.LinkType.find(
linkingIdsStoD.contains(idx),
linkingIdsDtoS.contains(idx)
)
));
}
domains.removeIf(d -> d.url.domain.toString().length() > 32);
domains.removeIf(d -> d.url().domain.toString().length() > 32);
return domains;
}
public record SimilarDomain(EdgeUrl url,
int domainId,
double relatedness,
double rank,
boolean indexed,
boolean active,
boolean screenshot,
LinkType linkType)
{
public String getRankSymbols() {
if (rank > 90) {
return "&#9733;&#9733;&#9733;&#9733;&#9733;";
}
if (rank > 70) {
return "&#9733;&#9733;&#9733;&#9733;";
}
if (rank > 50) {
return "&#9733;&#9733;&#9733;";
}
if (rank > 30) {
return "&#9733;&#9733;";
}
if (rank > 10) {
return "&#9733;";
}
return "";
}
}
enum LinkType {
BACKWARD,
FOWARD,
BIDIRECTIONAL,
NONE;
public static LinkType find(boolean linkStod,
boolean linkDtos)
{
if (linkDtos && linkStod)
return BIDIRECTIONAL;
if (linkDtos)
return FOWARD;
if (linkStod)
return BACKWARD;
return NONE;
}
public String toString() {
return switch (this) {
case FOWARD -> "&#8594;";
case BACKWARD -> "&#8592;";
case BIDIRECTIONAL -> "&#8646;";
case NONE -> "-";
};
}
public String getDescription() {
return switch (this) {
case BACKWARD -> "Backward Link";
case FOWARD -> "Forward Link";
case BIDIRECTIONAL -> "Mutual Link";
case NONE -> "No Link";
};
}
};
}
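Editor's note: getRelatedness normalises the two ids to a (lower, higher) pair before the lookup, which suggests the relatedness scores are stored once per unordered pair. Purely as an assumption about that layout, here is a sketch of an upper-triangular packed matrix; the repository's actual storage may differ.

// Pure assumption: one compact way to hold a symmetric score matrix.
class PackedSymmetricMatrix {
    private final float[] values;

    PackedSymmetricMatrix(int n) {
        this.values = new float[n * (n + 1) / 2];   // one slot per unordered pair
    }

    private int offset(int a, int b) {
        int lower = Math.min(a, b);
        int higher = Math.max(a, b);
        return higher * (higher + 1) / 2 + lower;   // upper-triangular packing
    }

    float get(int a, int b)          { return values[offset(a, b)]; }
    void  set(int a, int b, float v) { values[offset(a, b)] = v; }
}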

View File

@ -20,8 +20,9 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
public class Suggestions {
private final PatriciaTrie<String> suggestionsTrie;
private final TermFrequencyDict termFrequencyDict;
private PatriciaTrie<String> suggestionsTrie = null;
private TermFrequencyDict termFrequencyDict = null;
private volatile boolean ready = false;
private final SpellChecker spellChecker;
private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
@ -35,10 +36,12 @@ public class Suggestions {
) {
this.spellChecker = spellChecker;
suggestionsTrie = loadSuggestions(suggestionsFile);
termFrequencyDict = dict;
logger.info("Loaded {} suggestions", suggestionsTrie.size());
Thread.ofPlatform().start(() -> {
suggestionsTrie = loadSuggestions(suggestionsFile);
termFrequencyDict = dict;
ready = true;
logger.info("Loaded {} suggestions", suggestionsTrie.size());
});
}
private static PatriciaTrie<String> loadSuggestions(Path file) {
@ -71,6 +74,9 @@ public class Suggestions {
}
public List<String> getSuggestions(int count, String searchWord) {
if (!ready)
return Collections.emptyList();
if (searchWord.length() < MIN_SUGGEST_LENGTH) {
return Collections.emptyList();
}
@ -126,6 +132,9 @@ public class Suggestions {
public Stream<String> getSuggestionsForKeyword(int count, String prefix) {
if (!ready)
return Stream.empty();
if (prefix.length() < MIN_SUGGEST_LENGTH) {
return Stream.empty();
}

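Editor's note: the change above swaps eager loading for a background platform thread guarded by a volatile ready flag, so lookups return empty results until the trie has finished loading. A minimal generic sketch of that pattern; field and method names are placeholders, only the volatile-flag idea is taken from the hunk.

import java.nio.file.Path;
import java.util.Map;
import java.util.Optional;

class LazyIndex {
    private volatile Map<String, Integer> index = null;
    private volatile boolean ready = false;

    LazyIndex(Path source) {
        Thread.ofPlatform().start(() -> {
            index = load(source);   // slow I/O kept off the request path
            ready = true;           // publish only after the data is fully built
        });
    }

    Optional<Integer> lookup(String key) {
        if (!ready)
            return Optional.empty();   // degrade gracefully while loading
        return Optional.ofNullable(index.get(key));
    }

    private static Map<String, Integer> load(Path source) {
        return Map.of();               // placeholder for the real parsing work
    }
}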
View File

@ -25,7 +25,7 @@ public class RecrawlAllActor extends RecordActorPrototype {
public record Initial() implements ActorStep {}
public record WaitFinished(int node) implements ActorStep {}
public record WaitFinished(int node, long msgId) implements ActorStep {}
@Resume(behavior=ActorResumeBehavior.RETRY)
public record Trigger(int node) implements ActorStep {}
public record AdvanceNode(int node) implements ActorStep {}
@ -49,17 +49,18 @@ public class RecrawlAllActor extends RecordActorPrototype {
var data = new ExecutorRemoteActorFactory.CrawlData(activeFileStorage.get(0), true);
if (remoteActorFactory.createCrawlRemote(node).trigger(data)) {
yield new WaitFinished(node);
long msgId = remoteActorFactory.createCrawlRemote(node).trigger(data);
if (msgId >= 0) {
yield new WaitFinished(node, msgId);
}
else {
yield new AdvanceNode(node);
}
}
case WaitFinished(int node) -> {
case WaitFinished(int node, long msgId) -> {
var remoteActor = remoteActorFactory.createCrawlRemote(node);
for (;;) {
var state = remoteActor.getState();
var state = remoteActor.getState(msgId);
if ("END".equals(state) || "ERROR".equals(state)) {
break;
}
@ -80,8 +81,7 @@ public class RecrawlAllActor extends RecordActorPrototype {
public RecrawlAllActor(Gson gson,
ExecutorRemoteActorFactory remoteActorFactory,
FileStorageService fileStorageService,
PrecessionNodes precessionNodes,
NodeConfigurationService nodeConfigurationService)
PrecessionNodes precessionNodes)
{
super(gson);
this.remoteActorFactory = remoteActorFactory;

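Editor's note: the actor change threads the message id returned by trigger() through to WaitFinished, so the poll loop asks about that specific run rather than whatever state the remote actor last reported. A hedged sketch of a poll-until-terminal helper in that spirit; the getState(long) call and the 10-second interval come from the hunk, the handle interface is a stand-in rather than a repository type.

import java.util.concurrent.TimeUnit;

class RemoteRunPolling {
    interface RemoteRun {                       // stand-in for the real remote actor handle
        String getState(long msgId);
    }

    static String awaitTerminal(RemoteRun actor, long msgId) throws InterruptedException {
        for (;;) {
            String state = actor.getState(msgId);
            if ("END".equals(state) || "ERROR".equals(state))
                return state;
            TimeUnit.SECONDS.sleep(10);         // same poll interval as the actor above
        }
    }
}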
View File

@ -24,7 +24,7 @@ public class ReprocessAllActor extends RecordActorPrototype {
public record Initial() implements ActorStep {}
public record WaitFinished(int node) implements ActorStep {}
public record WaitFinished(int node, long msgId) implements ActorStep {}
@Resume(behavior=ActorResumeBehavior.RETRY)
public record Trigger(int node) implements ActorStep {}
public record AdvanceNode(int node) implements ActorStep {}
@ -47,17 +47,18 @@ public class ReprocessAllActor extends RecordActorPrototype {
var data = new ExecutorRemoteActorFactory.ConvertAndLoadData(activeFileStorage.get(0));
if (remoteActorFactory.createConvertAndLoadRemote(node).trigger(data)) {
yield new WaitFinished(node);
long msgId = remoteActorFactory.createConvertAndLoadRemote(node).trigger(data);
if (msgId >= 0) {
yield new WaitFinished(node, msgId);
}
else {
yield new AdvanceNode(node);
}
}
case WaitFinished(int node) -> {
case WaitFinished(int node, long msgId) -> {
var remoteActor = remoteActorFactory.createConvertAndLoadRemote(node);
for (;;) {
var state = remoteActor.getState();
var state = remoteActor.getState(msgId);
if ("END".equals(state) || "ERROR".equals(state))
break;
TimeUnit.SECONDS.sleep(10);

View File

@ -37,7 +37,7 @@ public class IndexSearchSetsService {
// Below are binary indices that are used to constrain a search
private volatile RankingSearchSet retroSet;
private volatile RankingSearchSet popularSet;
private volatile RankingSearchSet smallWebSet;
private volatile RankingSearchSet academiaSet;
private volatile RankingSearchSet blogsSet;
@ -72,7 +72,7 @@ public class IndexSearchSetsService {
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat"));
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat"));
popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, servicesFactory.getSearchSetsBase().resolve("popular.dat"));
blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, servicesFactory.getSearchSetsBase().resolve("blogs.dat"));
}
@ -86,7 +86,7 @@ public class IndexSearchSetsService {
}
return switch (searchSetIdentifier) {
case NONE -> anySet;
case RETRO -> retroSet;
case POPULAR -> popularSet;
case ACADEMIA -> academiaSet;
case SMALLWEB -> smallWebSet;
case BLOGS -> blogsSet;
@ -95,7 +95,7 @@ public class IndexSearchSetsService {
enum RepartitionSteps {
UPDATE_ACADEMIA,
UPDATE_RETRO,
UPDATE_POPULAR,
UPDATE_SMALL_WEB,
UPDATE_BLOGS,
UPDATE_RANKINGS,
@ -107,8 +107,8 @@ public class IndexSearchSetsService {
processHeartbeat.progress(RepartitionSteps.UPDATE_ACADEMIA);
updateAcademiaDomainsSet();
processHeartbeat.progress(RepartitionSteps.UPDATE_RETRO);
updateRetroDomainsSet();
processHeartbeat.progress(RepartitionSteps.UPDATE_POPULAR);
updatePopularDomainsSet();
processHeartbeat.progress(RepartitionSteps.UPDATE_SMALL_WEB);
updateSmallWebDomainsSet();
@ -139,15 +139,15 @@ public class IndexSearchSetsService {
}
@SneakyThrows
public void updateRetroDomainsSet() {
public void updatePopularDomainsSet() {
var entry = rankingSettings.retro;
var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
synchronized (this) {
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
retroSet.write();
popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, popularSet.source, data);
popularSet.write();
}
}

View File

@ -12,6 +12,7 @@ include 'code:services-application:dating-service'
include 'code:services-application:explorer-service'
include 'code:libraries:array'
include 'code:libraries:geo-ip'
include 'code:libraries:btree'
include 'code:libraries:easy-lsh'
include 'code:libraries:guarded-regex'