From fc30da0d480fd690e2eaf4ec1fd741b294fd9f12 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 8 Dec 2023 20:31:34 +0100 Subject: [PATCH 01/15] (converter) Add academia recognition to DomainProcessor The code now includes an additional function in the DomainProcessor class that checks if a domain is associated with academia. An academic domain is identified by the ".edu" TLD, or fits a specific regex pattern matching domains like *.ac.ccTld or *.edu.ccTld. If these conditions are met, the search term "special:academia" is added to the domain. The existing academia search filter uses personalized pagerank to select academia-adjacent domains, but it isn't working very well. The hope is that filtering on domain names will be more effective, and that it can supplant the ranking-based approach. --- .../converting/processor/DomainProcessor.java | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index f5effebd..f9ae890c 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -22,6 +22,7 @@ import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.*; +import java.util.regex.Pattern; public class DomainProcessor { private final DocumentProcessor documentProcessor; @@ -94,6 +95,10 @@ public class DomainProcessor { terms.add(HtmlFeature.COOKIES.getKeyword()); } + if (isAcademicDomain(ret.domain)) { + terms.add("special:academia"); + } + for (var document : ret.documents) { if (document.details == null) continue; @@ -114,6 +119,19 @@ public class DomainProcessor { return ret; } + + private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$"); + private boolean isAcademicDomain(EdgeDomain domain) { + + if (domain.domain.endsWith(".edu")) + return true; + + if (academicPattern.matcher(domain.domain).matches()) + return true; + + return false; + } + private void fixBadCanonicalTag(CrawledDocument doc) { // Some sites have a canonical tag that points to a different domain, // but our loader can not support this, so we point these back to the From d0982e7ba5f709fc14aa2cf387e3a77324294feb Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 9 Dec 2023 12:33:39 +0100 Subject: [PATCH 02/15] (converter) Add error handling and lazy load external domain links The converter was not properly initiating the external links for each domain, causing an NPE in conversion. This needs to be loaded later since we don't know the domain we're processing until we've seen it in the crawl data. Also made some refactorings to make finding converter bugs easier, and finding the related domain less awkward from the SerializableCrawlData interface. --- .../crawling/model/CrawledDocument.java | 13 +++++++++++++ .../crawling/model/SerializableCrawlData.java | 1 + .../marginalia/converting/ConverterMain.java | 14 ++++++++++---- .../converting/processor/DomainProcessor.java | 19 ++++++++++++++++++- 4 files changed, 42 insertions(+), 5 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 94d13235..143c775b 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -4,6 +4,7 @@ import lombok.AllArgsConstructor; import lombok.Builder; import lombok.ToString; import nu.marginalia.bigstring.BigString; +import nu.marginalia.model.EdgeUrl; @Builder @AllArgsConstructor @@ -35,4 +36,16 @@ public class CrawledDocument implements SerializableCrawlData { return SERIAL_IDENTIFIER; } + @Override + public String getDomain() { + if (url == null) + return null; + + return EdgeUrl + .parse(url) + .map(EdgeUrl::getDomain) + .map(d -> d.domain) + .orElse(null); + } + } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java index c9804d54..48b3f65d 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java @@ -2,4 +2,5 @@ package nu.marginalia.crawling.model; public interface SerializableCrawlData { String getSerialIdentifier(); + String getDomain(); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index fb919018..ebfb1bc2 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -138,10 +138,16 @@ public class ConverterMain { for (var domain : plan.crawlDataIterable(id -> !batchingWorkLog.isItemProcessed(id))) { pool.submit(() -> { - ProcessedDomain processed = processor.process(domain); - converterWriter.accept(processed); - - heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); + try { + ProcessedDomain processed = processor.process(domain); + converterWriter.accept(processed); + } + catch (Exception ex) { + logger.info("Error in processing", ex); + } + finally { + heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); + } }); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index f9ae890c..00a05257 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -55,11 +55,21 @@ public class DomainProcessor { boolean cookies = false; String ip = ""; - DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain); + DomainLinks externalDomainLinks = null; while (dataStream.hasNext()) { var data = dataStream.next(); + // Do a lazy load of the external domain links since we don't know the domain + // until we see the first document + if (externalDomainLinks == null) { + var domain = data.getDomain(); + + if (domain != null) { + externalDomainLinks = anchorTagsSource.getAnchorTags(domain); + } + } + if (data instanceof CrawledDomain crawledDomain) { ret.domain = new EdgeDomain(crawledDomain.domain); ret.ip = crawledDomain.ip; @@ -77,8 +87,15 @@ public class DomainProcessor { try { if (doc.url == null) continue; + fixBadCanonicalTag(doc); + // This case should never be reachable, as we should have initiated + // the externalDomainLinks variable above if we made it past the + // doc.url == null check; but we'll leave it here just in case + // to make debugging easier if we break this. + assert externalDomainLinks != null : "externalDomainLinks has not been initialized"; + docs.add(documentProcessor.process(doc, externalDomainLinks)); } catch (Exception ex) { From eccb12b366ebf6e1e31d081bb8d557ece48e55cb Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 9 Dec 2023 12:50:05 +0100 Subject: [PATCH 03/15] (control) Fix spurious state detection in control-side actors A race condition was found where precession actors would sometimes skip a step, because when invoking ExecutorRemoteActor.getState(), it would get the last 'OK' actor state from a previous run of the actor! To avoid this, the trigger method was changed from returning a boolean to the message ID, negative if an error occurred, to be passed to getState to select only messages that pertain to the present or future runs. --- .../client/ExecutorRemoteActorFactory.java | 33 ++++++++++++++----- .../mq/persistence/MqPersistence.java | 10 ++++-- .../actor/precession/RecrawlAllActor.java | 14 ++++---- .../actor/precession/ReprocessAllActor.java | 11 ++++--- 4 files changed, 44 insertions(+), 24 deletions(-) diff --git a/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorRemoteActorFactory.java b/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorRemoteActorFactory.java index 4e5cb8cb..ffbe168c 100644 --- a/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorRemoteActorFactory.java +++ b/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorRemoteActorFactory.java @@ -34,8 +34,23 @@ public class ExecutorRemoteActorFactory { } public interface ExecutorRemoteActorIf { - boolean trigger(T object) throws Exception; - String getState(); + + /** Trigger the remote actor with the given object. The object will be serialized to JSON and sent to the + * remote actor. If the remote actor does not respond after a time period, a timeout will occur and a negative + * message id will be returned. + * + * @param object The message to send to the remote actot + * @return The message id of the response message, or a negative number if the remote actor did not respond + * within a reasonable timeout seconds. + */ + long trigger(T object) throws Exception; + + /** Get the last finished state of the actor. + *

+ * The message id of the request initiating the actor must be provided to ensure that + * we don't get a state from a previous run. + */ + String getState(long fromMsgId); } public record CrawlData(FileStorageId storageId, boolean cascadeLoad) {} @@ -58,11 +73,11 @@ class ExecutorRemoteActor implements ExecutorRemoteActorFactory.ExecutorRemot this.triggerFunction = triggerFunction; } - public boolean trigger(T object) throws Exception { + public long trigger(T object) throws Exception { return trigger(gson.toJson(object)); } - public boolean trigger(String payload) throws Exception { + public long trigger(String payload) throws Exception { long id = persistence.sendNewMessage(inboxName, null, null, triggerFunction, payload, null); // Wait for the remote actor to respond to the message @@ -70,19 +85,19 @@ class ExecutorRemoteActor implements ExecutorRemoteActorFactory.ExecutorRemot for (int i = 0; i < 120; i++) { var msg = persistence.getMessage(id); if (msg.state() == MqMessageState.ACK || msg.state() == MqMessageState.OK) - return true; + return id; if (msg.state() == MqMessageState.ERR || msg.state() == MqMessageState.DEAD) - return false; + return -id; TimeUnit.SECONDS.sleep(1); } - return false; // Timeout + return -1; // Timeout } - public String getState() { + public String getState(long fromMsgId) { return persistence - .getHeadMessage(inboxName) + .getHeadMessage(inboxName, fromMsgId) .map(MqMessage::function) .orElse("INITIAL"); } diff --git a/code/libraries/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/libraries/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index 8b4e9ac5..732aa70c 100644 --- a/code/libraries/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/libraries/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -496,17 +496,21 @@ public class MqPersistence { return gson; } - /** Returns the last message sent to this inbox with a state of 'OK' */ - public Optional getHeadMessage(String inboxName) { + /** Returns the last message sent to this inbox with a state of 'OK' + * with an id greater than or equal to fromMsgId + */ + public Optional getHeadMessage(String inboxName, long fromMsgId) { try (var conn = dataSource.getConnection(); var query = conn.prepareStatement(""" SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE, SENDER_INBOX FROM MESSAGE_QUEUE - WHERE RECIPIENT_INBOX = ? AND STATE='OK' + WHERE RECIPIENT_INBOX = ? AND STATE='OK' AND ID >= ? ORDER BY ID DESC LIMIT 1 """)) { query.setString(1, inboxName); + query.setLong(2, fromMsgId); + var rs = query.executeQuery(); if (rs.next()) { long msgId = rs.getLong(1); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/RecrawlAllActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/RecrawlAllActor.java index a6733b63..e0268901 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/RecrawlAllActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/RecrawlAllActor.java @@ -25,7 +25,7 @@ public class RecrawlAllActor extends RecordActorPrototype { public record Initial() implements ActorStep {} - public record WaitFinished(int node) implements ActorStep {} + public record WaitFinished(int node, long msgId) implements ActorStep {} @Resume(behavior=ActorResumeBehavior.RETRY) public record Trigger(int node) implements ActorStep {} public record AdvanceNode(int node) implements ActorStep {} @@ -49,17 +49,18 @@ public class RecrawlAllActor extends RecordActorPrototype { var data = new ExecutorRemoteActorFactory.CrawlData(activeFileStorage.get(0), true); - if (remoteActorFactory.createCrawlRemote(node).trigger(data)) { - yield new WaitFinished(node); + long msgId = remoteActorFactory.createCrawlRemote(node).trigger(data); + if (msgId >= 0) { + yield new WaitFinished(node, msgId); } else { yield new AdvanceNode(node); } } - case WaitFinished(int node) -> { + case WaitFinished(int node, long msgId) -> { var remoteActor = remoteActorFactory.createCrawlRemote(node); for (;;) { - var state = remoteActor.getState(); + var state = remoteActor.getState(msgId); if ("END".equals(state) || "ERROR".equals(state)) { break; } @@ -80,8 +81,7 @@ public class RecrawlAllActor extends RecordActorPrototype { public RecrawlAllActor(Gson gson, ExecutorRemoteActorFactory remoteActorFactory, FileStorageService fileStorageService, - PrecessionNodes precessionNodes, - NodeConfigurationService nodeConfigurationService) + PrecessionNodes precessionNodes) { super(gson); this.remoteActorFactory = remoteActorFactory; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/ReprocessAllActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/ReprocessAllActor.java index 416f4d90..d37ecc2a 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/ReprocessAllActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/precession/ReprocessAllActor.java @@ -24,7 +24,7 @@ public class ReprocessAllActor extends RecordActorPrototype { public record Initial() implements ActorStep {} - public record WaitFinished(int node) implements ActorStep {} + public record WaitFinished(int node, long msgId) implements ActorStep {} @Resume(behavior=ActorResumeBehavior.RETRY) public record Trigger(int node) implements ActorStep {} public record AdvanceNode(int node) implements ActorStep {} @@ -47,17 +47,18 @@ public class ReprocessAllActor extends RecordActorPrototype { var data = new ExecutorRemoteActorFactory.ConvertAndLoadData(activeFileStorage.get(0)); - if (remoteActorFactory.createConvertAndLoadRemote(node).trigger(data)) { - yield new WaitFinished(node); + long msgId = remoteActorFactory.createConvertAndLoadRemote(node).trigger(data); + if (msgId >= 0) { + yield new WaitFinished(node, msgId); } else { yield new AdvanceNode(node); } } - case WaitFinished(int node) -> { + case WaitFinished(int node, long msgId) -> { var remoteActor = remoteActorFactory.createConvertAndLoadRemote(node); for (;;) { - var state = remoteActor.getState(); + var state = remoteActor.getState(msgId); if ("END".equals(state) || "ERROR".equals(state)) break; TimeUnit.SECONDS.sleep(10); From b6511fbfe29c30c1e418435f946d51e26144238a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 9 Dec 2023 13:23:21 +0100 Subject: [PATCH 04/15] (converter) Add AnchorTextKeywords to EncyclopediaMarginaliaNuSideloader processing The commit updates EncyclopediaMarginaliaNuSideloader to include the AnchorTextKeywords in processing documents, aiding search result relevance. It also removes old test-related functionality and a large but fairly useless test previously used to debug a specific problem, to the detriment of the overall code quality. --- .../sideload/SideloadSourceFactory.java | 7 +- .../EncyclopediaMarginaliaNuSideloader.java | 45 +++---- ...ncyclopediaMarginaliaNuSideloaderTest.java | 114 ------------------ 3 files changed, 22 insertions(+), 144 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index debc460f..60f81d19 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload; import com.google.gson.Gson; import com.google.inject.Inject; +import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory; import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader; @@ -20,6 +21,7 @@ public class SideloadSourceFactory { private final SideloaderProcessing sideloaderProcessing; private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider; private final DocumentKeywordExtractor documentKeywordExtractor; + private final AnchorTextKeywords anchorTextKeywords; private final AnchorTagsSourceFactory anchorTagsSourceFactory; private final DirtreeSideloaderFactory dirtreeSideloaderFactory; @@ -27,19 +29,20 @@ public class SideloadSourceFactory { public SideloadSourceFactory(Gson gson, SideloaderProcessing sideloaderProcessing, ThreadLocalSentenceExtractorProvider sentenceExtractorProvider, - DocumentKeywordExtractor documentKeywordExtractor, + DocumentKeywordExtractor documentKeywordExtractor, AnchorTextKeywords anchorTextKeywords, AnchorTagsSourceFactory anchorTagsSourceFactory, DirtreeSideloaderFactory dirtreeSideloaderFactory) { this.gson = gson; this.sideloaderProcessing = sideloaderProcessing; this.sentenceExtractorProvider = sentenceExtractorProvider; this.documentKeywordExtractor = documentKeywordExtractor; + this.anchorTextKeywords = anchorTextKeywords; this.anchorTagsSourceFactory = anchorTagsSourceFactory; this.dirtreeSideloaderFactory = dirtreeSideloaderFactory; } public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException { - return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, sideloaderProcessing); + return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing); } public Collection sideloadDirtree(Path pathToYamlFile) throws IOException { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index aab62ef9..fc1f5015 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.sideload.encyclopedia; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; import lombok.SneakyThrows; +import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.model.DisqualifiedException; @@ -35,7 +36,7 @@ import java.util.concurrent.atomic.AtomicBoolean; /** This is an experimental sideloader for encyclopedia.marginalia.nu's database; * (which serves as a way of loading wikipedia's zim files without binding to GPL2'd code) - * + *

* See https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu for extracting the data */ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable { @@ -43,6 +44,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC private final Connection connection; private final EdgeUrl baseUrl; private final Gson gson; + private final AnchorTextKeywords anchorTextKeywords; private final SideloaderProcessing sideloaderProcessing; private final AnchorTagsSourceFactory anchorTagsSourceFactory; private static final Logger logger = LoggerFactory.getLogger(EncyclopediaMarginaliaNuSideloader.class); @@ -51,9 +53,11 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC String baseUrl, Gson gson, AnchorTagsSourceFactory anchorTagsSourceFactory, + AnchorTextKeywords anchorTextKeywords, SideloaderProcessing sideloaderProcessing) throws SQLException { this.baseUrl = EdgeUrl.parse(baseUrl).orElseThrow(AssertionError::new); this.gson = gson; + this.anchorTextKeywords = anchorTextKeywords; this.sideloaderProcessing = sideloaderProcessing; String sqliteDbString = "jdbc:sqlite:" + pathToDbFile.toString(); @@ -103,7 +107,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC try { docs.add(convertDocument(articleParts.parts, title, url, domainLinks)); } catch (URISyntaxException | DisqualifiedException e) { - e.printStackTrace(); + logger.warn("Problem converting encyclopedia article " + url, e); } finally { sem.release(); } @@ -113,7 +117,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC stmt.close(); } catch (Exception e) { - e.printStackTrace(); + logger.warn("Problem converting encyclopedia article", e); } finally { isFinished.set(true); @@ -142,30 +146,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC logger.error("Failed to create anchor tags source", ex); return new DomainLinks(); } - - } - - ProcessedDocument processJust(String url) throws SQLException, IOException, URISyntaxException, DisqualifiedException { - var stmt = connection.prepareStatement(""" - SELECT url,title,html - FROM articles - WHERE url=? - """); - stmt.setFetchSize(100); - stmt.setString(1, url); - - var rs = stmt.executeQuery(); - if (rs.next()) { - var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class); - String title = rs.getString("title"); - - return convertDocument(articleParts.parts, - title, - URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8), - new DomainLinks() // FIXME (2023-11-06): Sideloaded dirtrees don't have access to anchor tag data. - ); - } - return null; } private ProcessedDocument convertDocument(List parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException { @@ -180,13 +160,22 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC } fullHtml.append(""); - return sideloaderProcessing + var doc = sideloaderProcessing .processDocument(fullUrl, fullHtml.toString(), List.of("encyclopedia", "wiki"), domainLinks, GeneratorType.WIKI, 10_000_000); + + // Add anchor text keywords + if (doc.isProcessedFully()) { + doc.words.addAnchorTerms( + anchorTextKeywords.getAnchorTextKeywords(domainLinks, doc.url) + ); + } + + return doc; } private T fromCompressedJson(byte[] stream, Class type) throws IOException { diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloaderTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloaderTest.java index 0b1b6904..fd4aa73a 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloaderTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloaderTest.java @@ -1,38 +1,12 @@ package nu.marginalia.converting.sideload.encyclopedia; -import com.google.inject.AbstractModule; -import com.google.inject.Guice; -import gnu.trove.list.array.TLongArrayList; -import nu.marginalia.atags.source.AnchorTagsSourceFactory; -import nu.marginalia.atags.model.DomainLinks; -import nu.marginalia.converting.ConverterModule; -import nu.marginalia.converting.model.DisqualifiedException; -import nu.marginalia.converting.processor.ConverterDomainTypes; -import nu.marginalia.converting.sideload.SideloaderProcessing; -import nu.marginalia.io.processed.DocumentRecordParquetFileReader; -import nu.marginalia.io.processed.DocumentRecordParquetFileWriter; -import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.model.processed.DocumentRecord; import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.mockito.Mockito; import java.io.IOException; -import java.net.URISyntaxException; import java.nio.file.Files; import java.nio.file.Path; -import java.sql.SQLException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import static org.junit.jupiter.api.Assertions.*; -import static org.mockito.Mockito.when; class EncyclopediaMarginaliaNuSideloaderTest { Path tempFile; @@ -62,93 +36,5 @@ class EncyclopediaMarginaliaNuSideloaderTest { System.out.printf("%64s\n", Long.toBinaryString(Long.reverseBytes(0x1000000000000000L))); System.out.printf("%64s\n", Long.toBinaryString(0x10L)); } - @Test - public void debugSpecificArticle() throws SQLException, IOException, URISyntaxException, DisqualifiedException { - Path pathToDbFile = Path.of("/home/vlofgren/Code/MarginaliaSearch/run/samples/articles.db"); - if (!Files.exists(pathToDbFile)) { - // not really practical to ship a 40 Gb sqlite files on github - // be @vlofgren to run this test - return; - } - var domainTypesMock = Mockito.mock(ConverterDomainTypes.class); - when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false); - var processing = Guice.createInjector(new ConverterModule(), - new AbstractModule() { - public void configure() { - bind(ConverterDomainTypes.class).toInstance(domainTypesMock); - } - } - ) - .getInstance(SideloaderProcessing.class); - var atagsFactory = Mockito.mock(AnchorTagsSourceFactory.class); - when(atagsFactory.create(Mockito.any())).thenReturn(domain -> new DomainLinks()); - - var sideloader = new EncyclopediaMarginaliaNuSideloader( - pathToDbFile, - "https://en.wikipedia.org/wiki/", - GsonFactory.get(), - atagsFactory, - processing - ); - - var document = sideloader.processJust("Don't_Tell_Me_(Madonna_song)"); - - System.out.println(document); - - var keywordsBuilt = document.words.build(); - - var ptr = keywordsBuilt.newPointer(); - - Map dirtyAndBlues = new HashMap<>(); - - while (ptr.advancePointer()) { - String word = ptr.getKeyword(); - - System.out.println(word + ": " + Long.toHexString(Long.reverseBytes(ptr.getMetadata()))); - - if (Set.of("dirty", "blues").contains(word)) { - WordMetadata meta = new WordMetadata(ptr.getMetadata()); - - Assertions.assertNull( - dirtyAndBlues.put(word, meta) - ); - } - } - - Assertions.assertTrue(dirtyAndBlues.containsKey("dirty")); - Assertions.assertTrue(dirtyAndBlues.containsKey("blues")); - Assertions.assertNotEquals( - dirtyAndBlues.get("dirty"), - dirtyAndBlues.get("blues") - ); - - try (var dw = new DocumentRecordParquetFileWriter(tempFile)) { - dw.write(new DocumentRecord( - "encyclopedia.marginalia.nu", - document.url.toString(), - 0, - document.state.toString(), - document.stateReason, - document.details.title, - document.details.description, - HtmlFeature.encode(document.details.features), - document.details.standard.name(), - document.details.length, - document.details.hashCode, - (float) document.details.quality, - document.details.metadata.encode(), - document.details.pubYear, - List.of(keywordsBuilt.keywords), - new TLongArrayList(keywordsBuilt.metadata) - )); - } - - var record = DocumentRecordParquetFileReader.streamKeywordsProjection(tempFile).findFirst().get(); - String[] words = record.words.toArray(String[]::new); - long[] meta = record.metas.toArray(); - - assertArrayEquals(keywordsBuilt.keywords, words); - assertArrayEquals(keywordsBuilt.metadata, meta); - } } \ No newline at end of file From 5c46af0edb5ab077b9ad14b777f4574380e0e9a8 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 9 Dec 2023 15:20:31 +0100 Subject: [PATCH 05/15] (converter) Refactor EncyclopediaMarginaliaNuSideloader to use ProcessingIterator Refactored the getDocumentsStream method in EncyclopediaMarginaliaNuSideloader to use the newly extracted ProcessingIterator class that encapsulates processing a stream of results from e.g a database query in parallel and returning the computed results as an iterator. The iterator was also improved on to be more reliable, previous versions of the logic would sometimes deadlock due to false positives in hasMore(). --- .../marginalia/util/ProcessingIterator.java | 139 ++++++++++++++++++ .../EncyclopediaMarginaliaNuSideloader.java | 68 ++------- .../util/ProcessingIteratorTest.java | 38 +++++ 3 files changed, 190 insertions(+), 55 deletions(-) create mode 100644 code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java create mode 100644 code/processes/converting-process/src/test/java/nu/marginalia/util/ProcessingIteratorTest.java diff --git a/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java b/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java new file mode 100644 index 00000000..15dbc087 --- /dev/null +++ b/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java @@ -0,0 +1,139 @@ +package nu.marginalia.util; + +import lombok.SneakyThrows; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Iterator; +import java.util.NoSuchElementException; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Consumer; + +/** + * Abstraction for exposing a (typically) read-from-disk -> parallel processing -> sequential output + * workflow as an iterator, where the number of tasks is much larger than the number of cores + */ +public class ProcessingIterator implements Iterator { + private static final Logger logger = LoggerFactory.getLogger(ProcessingIterator.class); + + private final LinkedBlockingQueue queue; + private final AtomicBoolean isFinished = new AtomicBoolean(false); + private final ExecutorService executorService; + private final Semaphore sem; + + private T next = null; + + private final int parallelism; + + public ProcessingIterator(int queueSize, int parallelism, ProcessingJob task) { + this.parallelism = parallelism; + + queue = new LinkedBlockingQueue<>(queueSize); + executorService = Executors.newFixedThreadPool(parallelism); + sem = new Semaphore(parallelism); + + executorService.submit(() -> executeJob(task)); + } + + private void executeJob(ProcessingJob job) { + try { + job.run(this::executeTask); + } catch (Exception e) { + logger.warn("Exception while processing", e); + } finally { + isFinished.set(true); + } + } + + private void executeTask(Task task) { + try { + sem.acquire(); + } catch (InterruptedException e) { + return; + } + + try { + queue.put(task.get()); + } catch (Exception e) { + logger.warn("Exception while processing", e); + } finally { + sem.release(); + } + } + + /** Returns true if there are more documents to be processed. + * This method may block until we are certain this is true. + *

+ * This method must be invoked from the same thread that invokes next(), + * (or synchronize between the two) + */ + @Override + @SneakyThrows + public boolean hasNext() { + if (next != null) + return true; + + do { + next = queue.poll(1, TimeUnit.SECONDS); + if (next != null) { + return true; + } + } while (expectMore()); + + if (!executorService.isShutdown()) { + executorService.shutdown(); + } + + return false; + } + + /** Heuristic for if we should expect more documents to be processed, + * _trust but verify_ since we don't run this in an exclusive section + * and may get a false positive. We never expect a false negative though. + */ + private boolean expectMore() { + return !isFinished.get() // we are still reading from the database + || !queue.isEmpty() // ... or we have documents in the queue + || sem.availablePermits() < parallelism; // ... or we are still processing documents + } + + /** Returns the next document to be processed. + * This method may block until we are certain there is a document to be processed. + *

+ * This method must be invoked from the same thread that invokes hasNext(), + * (or synchronize between the two) + *

+ * If this is run after hasNext() returns false, a NoSuchElementException is thrown. + */ + @SneakyThrows + @Override + public T next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + + try { + return next; + } + finally { + next = null; + } + } + + /** + * A job that produces a sequence of processing tasks that are to be + * performed in parallel + */ + public interface ProcessingJob { + void run(Consumer> output) throws Exception; + } + + /** + * A single task that produces a result to be iterable via the Iterator interface + * (along with other tasks' outputs) + */ + public interface Task { + T get() throws Exception; + } +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index fc1f5015..204aa6a8 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -15,6 +15,7 @@ import nu.marginalia.converting.sideload.SideloaderProcessing; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.util.ProcessingIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -28,11 +29,6 @@ import java.nio.file.Path; import java.sql.*; import java.util.Iterator; import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.Semaphore; -import java.util.concurrent.atomic.AtomicBoolean; /** This is an experimental sideloader for encyclopedia.marginalia.nu's database; * (which serves as a way of loading wikipedia's zim files without binding to GPL2'd code) @@ -80,62 +76,24 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC @SneakyThrows @Override public Iterator getDocumentsStream() { - LinkedBlockingQueue docs = new LinkedBlockingQueue<>(32); - AtomicBoolean isFinished = new AtomicBoolean(false); + return new ProcessingIterator<>(24, 16, (taskConsumer) -> { + DomainLinks domainLinks = getDomainLinks(); - ExecutorService executorService = Executors.newFixedThreadPool(16); - Semaphore sem = new Semaphore(16); + var stmt = connection.prepareStatement(""" + SELECT url,title,html FROM articles + """); + stmt.setFetchSize(100); - DomainLinks domainLinks = getDomainLinks(); + var rs = stmt.executeQuery(); - executorService.submit(() -> { - try { - var stmt = connection.prepareStatement(""" - SELECT url,title,html FROM articles - """); - stmt.setFetchSize(100); + while (rs.next()) { + var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class); + String title = rs.getString("title"); + String url = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8); - var rs = stmt.executeQuery(); - while (rs.next()) { - var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class); - String title = rs.getString("title"); - String url = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8); - - sem.acquire(); - - executorService.submit(() -> { - try { - docs.add(convertDocument(articleParts.parts, title, url, domainLinks)); - } catch (URISyntaxException | DisqualifiedException e) { - logger.warn("Problem converting encyclopedia article " + url, e); - } finally { - sem.release(); - } - }); - } - - stmt.close(); - } - catch (Exception e) { - logger.warn("Problem converting encyclopedia article", e); - } - finally { - isFinished.set(true); + taskConsumer.accept(() -> convertDocument(articleParts.parts, title, url, domainLinks)); } }); - - return new Iterator<>() { - @Override - public boolean hasNext() { - return !isFinished.get() || !docs.isEmpty() || sem.availablePermits() < 16; - } - - @SneakyThrows - @Override - public ProcessedDocument next() { - return docs.take(); - } - }; } private DomainLinks getDomainLinks() { diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/util/ProcessingIteratorTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/util/ProcessingIteratorTest.java new file mode 100644 index 00000000..d20b7ddf --- /dev/null +++ b/code/processes/converting-process/src/test/java/nu/marginalia/util/ProcessingIteratorTest.java @@ -0,0 +1,38 @@ +package nu.marginalia.util; + +import org.junit.jupiter.api.Test; + +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class ProcessingIteratorTest { + + @Test + public void test() { + Set output = new HashSet<>(); + var iter = new ProcessingIterator(2, 2, q -> { + for (int i = 0; i < 10_000; i++) { + int j = i; + q.accept(() -> task(j)); + } + }); + while (iter.hasNext()) { + output.add(iter.next()); + } + + assertEquals(10_000, output.size()); + + for (int i = 0; i < 10_000; i++) { + assertTrue(output.contains(i)); + } + } + + int task(int n) throws InterruptedException { + TimeUnit.NANOSECONDS.sleep(10); + return n; + } +} \ No newline at end of file From 8ef34883a8278e2572929f7837eed0d5fdcec8da Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 9 Dec 2023 16:29:35 +0100 Subject: [PATCH 06/15] (search) Move site information out of the search service and into assistant. This reduces the impact of restarting the search service, as the site information takes a few minutes to load during which it's not available. It also permits exposing this information via API in the future if there is interest in this. The assistant service was also modified to do a late load of the suggestions trie, as this is a major contributor to its start-up time. Finally, some changes were made to the client library, a new get() method was added that takes a TypeToken to allow deserialization of generics such as List, and the scheduler was also modified to use virtual threads. --- .../assistant/client/AssistantClient.java | 32 ++++++- .../client}/model/DomainInformation.java | 3 +- .../assistant/client/model/SimilarDomain.java | 69 ++++++++++++++ .../marginalia/client/AbortingScheduler.java | 17 +--- .../nu/marginalia/client/AbstractClient.java | 27 ++++++ .../client/AbstractDynamicClient.java | 4 +- .../marginalia/client/AbstractClientTest.java | 2 +- .../search/svc/SearchSiteInfoService.java | 62 +++++++------ .../assistant-service/build.gradle | 1 + .../assistant/AssistantService.java | 51 ++++++++++- .../domains}/DomainInformationService.java | 14 +-- .../domains}/SimilarDomainsService.java | 90 +++---------------- .../assistant/suggest/Suggestions.java | 21 +++-- 13 files changed, 252 insertions(+), 141 deletions(-) rename code/{services-application/search-service/src/main/java/nu/marginalia/search => api/assistant-api/src/main/java/nu/marginalia/assistant/client}/model/DomainInformation.java (87%) create mode 100644 code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/SimilarDomain.java rename code/{services-application/search-service/src/main/java/nu/marginalia/search/siteinfo => services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains}/DomainInformationService.java (95%) rename code/{services-application/search-service/src/main/java/nu/marginalia/search/svc => services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains}/SimilarDomainsService.java (84%) diff --git a/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/AssistantClient.java b/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/AssistantClient.java index eaa9f0f5..aa263599 100644 --- a/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/AssistantClient.java +++ b/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/AssistantClient.java @@ -1,12 +1,14 @@ package nu.marginalia.assistant.client; +import com.google.gson.reflect.TypeToken; import com.google.inject.Inject; import com.google.inject.Singleton; import io.reactivex.rxjava3.core.Observable; import nu.marginalia.assistant.client.model.DictionaryResponse; +import nu.marginalia.assistant.client.model.DomainInformation; +import nu.marginalia.assistant.client.model.SimilarDomain; import nu.marginalia.client.AbstractDynamicClient; import nu.marginalia.client.exception.RouteNotConfiguredException; -import nu.marginalia.WmsaHome; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.service.descriptor.ServiceDescriptors; import nu.marginalia.service.id.ServiceId; @@ -14,6 +16,7 @@ import nu.marginalia.client.Context; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; import java.util.List; @Singleton @@ -59,4 +62,31 @@ public class AssistantClient extends AbstractDynamicClient { return Observable.empty(); } } + + public Observable> similarDomains(Context ctx, int domainId, int count) { + try { + return super.get(ctx, 0, STR."/domain/\{domainId}/similar?count=\{count}", new TypeToken>() {}); + } + catch (RouteNotConfiguredException ex) { + return Observable.empty(); + } + } + + public Observable> linkedDomains(Context ctx, int domainId, int count) { + try { + return super.get(ctx, 0, STR."/domain/\{domainId}/linking?count=\{count}", new TypeToken>() {}); + } + catch (RouteNotConfiguredException ex) { + return Observable.empty(); + } + } + + public Observable domainInformation(Context ctx, int domainId) { + try { + return super.get(ctx, 0, STR."/domain/\{domainId}/info", DomainInformation.class); + } + catch (RouteNotConfiguredException ex) { + return Observable.empty(); + } + } } diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/DomainInformation.java b/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DomainInformation.java similarity index 87% rename from code/services-application/search-service/src/main/java/nu/marginalia/search/model/DomainInformation.java rename to code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DomainInformation.java index 2491258d..5a78fba8 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/DomainInformation.java +++ b/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DomainInformation.java @@ -1,4 +1,4 @@ -package nu.marginalia.search.model; +package nu.marginalia.assistant.client.model; import lombok.*; import nu.marginalia.model.EdgeDomain; @@ -24,5 +24,4 @@ public class DomainInformation { boolean unknownDomain; String state; - List linkingDomains; } diff --git a/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/SimilarDomain.java b/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/SimilarDomain.java new file mode 100644 index 00000000..ccd7fe5b --- /dev/null +++ b/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/SimilarDomain.java @@ -0,0 +1,69 @@ +package nu.marginalia.assistant.client.model; + +import nu.marginalia.model.EdgeUrl; + +public record SimilarDomain(EdgeUrl url, + int domainId, + double relatedness, + double rank, + boolean indexed, + boolean active, + boolean screenshot, + LinkType linkType) { + + public String getRankSymbols() { + if (rank > 90) { + return "★★★★★"; + } + if (rank > 70) { + return "★★★★"; + } + if (rank > 50) { + return "★★★"; + } + if (rank > 30) { + return "★★"; + } + if (rank > 10) { + return "★"; + } + return ""; + } + + public enum LinkType { + BACKWARD, + FOWARD, + BIDIRECTIONAL, + NONE; + + public static LinkType find(boolean linkStod, + boolean linkDtos) { + if (linkDtos && linkStod) + return BIDIRECTIONAL; + if (linkDtos) + return FOWARD; + if (linkStod) + return BACKWARD; + + return NONE; + } + + public String toString() { + return switch (this) { + case FOWARD -> "→"; + case BACKWARD -> "←"; + case BIDIRECTIONAL -> "⇆"; + case NONE -> "-"; + }; + } + + public String getDescription() { + return switch (this) { + case BACKWARD -> "Backward Link"; + case FOWARD -> "Forward Link"; + case BIDIRECTIONAL -> "Mutual Link"; + case NONE -> "No Link"; + }; + } + } +} diff --git a/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java b/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java index f190bfe4..d603a546 100644 --- a/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java +++ b/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java @@ -1,6 +1,5 @@ package nu.marginalia.client; -import com.google.common.util.concurrent.ThreadFactoryBuilder; import io.reactivex.rxjava3.core.Scheduler; import io.reactivex.rxjava3.schedulers.Schedulers; import org.slf4j.Logger; @@ -10,26 +9,16 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; -import java.util.concurrent.ThreadFactory; public class AbortingScheduler { - private final ThreadFactory threadFactory; - private final Logger logger = LoggerFactory.getLogger(getClass()); @Nullable private ExecutorService executorService; - public AbortingScheduler(String name) { - threadFactory = new ThreadFactoryBuilder() - .setNameFormat(name+"client--%d") - .setUncaughtExceptionHandler(this::handleException) - .build(); + public AbortingScheduler() { } - private void handleException(Thread thread, Throwable throwable) { - logger.error("Uncaught exception during Client IO in thread {}", thread.getName(), throwable); - } public synchronized Scheduler get() { return Schedulers.from(getExecutorService(), @@ -40,14 +29,14 @@ public class AbortingScheduler { public synchronized void abort() { if (null != executorService) { executorService.shutdownNow(); - executorService = Executors.newFixedThreadPool(16, threadFactory); + executorService = Executors.newVirtualThreadPerTaskExecutor(); } } @Nonnull private synchronized ExecutorService getExecutorService() { if (null == executorService) { - executorService = Executors.newFixedThreadPool(16, threadFactory); + executorService = Executors.newVirtualThreadPerTaskExecutor(); } return executorService; } diff --git a/code/common/service-client/src/main/java/nu/marginalia/client/AbstractClient.java b/code/common/service-client/src/main/java/nu/marginalia/client/AbstractClient.java index 92e1ff0c..697671f0 100644 --- a/code/common/service-client/src/main/java/nu/marginalia/client/AbstractClient.java +++ b/code/common/service-client/src/main/java/nu/marginalia/client/AbstractClient.java @@ -1,6 +1,7 @@ package nu.marginalia.client; import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; import com.google.protobuf.GeneratedMessageV3; import io.reactivex.rxjava3.core.Observable; import io.reactivex.rxjava3.core.ObservableSource; @@ -20,6 +21,7 @@ import org.slf4j.LoggerFactory; import spark.utils.IOUtils; import java.io.OutputStream; +import java.lang.reflect.Type; import java.net.ConnectException; import java.util.concurrent.TimeUnit; import java.util.function.Supplier; @@ -233,6 +235,22 @@ public abstract class AbstractClient implements AutoCloseable { .doFinally(() -> ThreadContext.remove("outbound-request")); } + protected synchronized Observable get(Context ctx, int node, String endpoint, TypeToken type) { + ensureAlive(node); + + var req = ctx.paint(new Request.Builder()).url(serviceRoutes.get(node) + endpoint).get().build(); + + return Observable.just(client.newCall(req)) + .subscribeOn(scheduler().get()) + .map(this::logInbound) + .map(Call::execute) + .map(this::logOutbound) + .map(rsp -> validateResponseStatus(rsp, req, 200)) + .map(rsp -> getEntity(rsp, type)) + .retryWhen(this::retryHandler) + .timeout(timeout, TimeUnit.SECONDS) + .doFinally(() -> ThreadContext.remove("outbound-request")); + } protected synchronized Observable get(Context ctx, int node, String endpoint, OutputStream outputStream) { ensureAlive(node); @@ -388,6 +406,15 @@ public abstract class AbstractClient implements AutoCloseable { } } @SneakyThrows + private T getEntity(Response response, TypeToken clazz) { + try (response) { + return gson.fromJson(response.body().charStream(), clazz); + } + catch (Exception ex) { + throw ex; + } + } + @SneakyThrows private String getText(Response response) { try (response) { return response.body().string(); diff --git a/code/common/service-client/src/main/java/nu/marginalia/client/AbstractDynamicClient.java b/code/common/service-client/src/main/java/nu/marginalia/client/AbstractDynamicClient.java index 6e735010..5d2e3ef6 100644 --- a/code/common/service-client/src/main/java/nu/marginalia/client/AbstractDynamicClient.java +++ b/code/common/service-client/src/main/java/nu/marginalia/client/AbstractDynamicClient.java @@ -1,8 +1,6 @@ package nu.marginalia.client; import com.google.gson.Gson; -import nu.marginalia.client.route.RouteProvider; -import nu.marginalia.client.route.ServiceRoute; import nu.marginalia.service.descriptor.ServiceDescriptor; import javax.annotation.Nonnull; @@ -20,7 +18,7 @@ public class AbstractDynamicClient extends AbstractClient { ); this.service = service; - this.scheduler = new AbortingScheduler(name()); + this.scheduler = new AbortingScheduler(); } @Override diff --git a/code/common/service-client/src/test/java/nu/marginalia/client/AbstractClientTest.java b/code/common/service-client/src/test/java/nu/marginalia/client/AbstractClientTest.java index 44445204..2f02a8a7 100644 --- a/code/common/service-client/src/test/java/nu/marginalia/client/AbstractClientTest.java +++ b/code/common/service-client/src/test/java/nu/marginalia/client/AbstractClientTest.java @@ -42,7 +42,7 @@ public class AbstractClientTest { client = new AbstractClient(new RouteProvider(new ServiceDescriptor(ServiceId.Api, "localhost")), 1, Gson::new) { @Override public AbortingScheduler scheduler() { - return new AbortingScheduler(name()); + return new AbortingScheduler(); } @Override diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java index c5f918d0..8cb9a84f 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java @@ -1,16 +1,16 @@ package nu.marginalia.search.svc; import com.google.inject.Inject; +import nu.marginalia.assistant.client.AssistantClient; +import nu.marginalia.assistant.client.model.SimilarDomain; import nu.marginalia.client.Context; import nu.marginalia.db.DbDomainQueries; -import nu.marginalia.db.DomainBlacklist; import nu.marginalia.model.EdgeDomain; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; import nu.marginalia.search.SearchOperator; -import nu.marginalia.search.model.DomainInformation; +import nu.marginalia.assistant.client.model.DomainInformation; import nu.marginalia.search.model.UrlDetails; -import nu.marginalia.search.siteinfo.DomainInformationService; import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData; import spark.Request; import spark.Response; @@ -23,22 +23,19 @@ import java.util.Map; public class SearchSiteInfoService { private final SearchOperator searchOperator; - private final SimilarDomainsService similarDomains; - private final DomainInformationService domainInformationService; + private final AssistantClient assistantClient; private final SearchFlagSiteService flagSiteService; private final DbDomainQueries domainQueries; private final MustacheRenderer renderer; @Inject public SearchSiteInfoService(SearchOperator searchOperator, - SimilarDomainsService similarDomains, - DomainInformationService domainInformationService, + AssistantClient assistantClient, RendererFactory rendererFactory, SearchFlagSiteService flagSiteService, DbDomainQueries domainQueries) throws IOException { this.searchOperator = searchOperator; - this.similarDomains = similarDomains; - this.domainInformationService = domainInformationService; + this.assistantClient = assistantClient; this.flagSiteService = flagSiteService; this.domainQueries = domainQueries; @@ -108,13 +105,6 @@ public class SearchSiteInfoService { false); } - private DomainInformation dummyInformation(String domainName) { - return DomainInformation.builder() - .domain(new EdgeDomain(domainName)) - .suggestForCrawling(true) - .unknownDomain(true) - .build(); - } private Backlinks listLinks(Context ctx, String domainName) { return new Backlinks(domainName, @@ -126,13 +116,24 @@ public class SearchSiteInfoService { final int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1); - final DomainInformation domainInfo = domainInformationService.domainInfo(domainName) - .orElseGet(() -> dummyInformation(domainName)); + final DomainInformation domainInfo; + final List similarSet; + final List linkingDomains; - final List similarSet = - similarDomains.getSimilarDomains(domainId, 100); - final List linkingDomains = - similarDomains.getLinkingDomains(domainId, 100); + if (domainId < 0 || !assistantClient.isAccepting()) { + domainInfo = createDummySiteInfo(domainName); + similarSet = List.of(); + linkingDomains = List.of(); + } + else { + domainInfo = assistantClient.domainInformation(ctx, domainId).blockingFirst(); + similarSet = assistantClient + .similarDomains(ctx, domainId, 100) + .blockingFirst(); + linkingDomains = assistantClient + .linkedDomains(ctx, domainId, 100) + .blockingFirst(); + } return new SiteInfoWithContext(domainName, domainId, @@ -141,6 +142,15 @@ public class SearchSiteInfoService { linkingDomains ); } + + private DomainInformation createDummySiteInfo(String domainName) { + return DomainInformation.builder() + .domain(new EdgeDomain(domainName)) + .suggestForCrawling(true) + .unknownDomain(true) + .build(); + } + private Docs listDocs(Context ctx, String domainName) { return new Docs(domainName, domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1), @@ -181,13 +191,13 @@ public class SearchSiteInfoService { String domain, long domainId, DomainInformation domainInformation, - List similar, - List linking) { + List similar, + List linking) { public SiteInfoWithContext(String domain, long domainId, DomainInformation domainInformation, - List similar, - List linking + List similar, + List linking ) { this(Map.of("info", true), diff --git a/code/services-core/assistant-service/build.gradle b/code/services-core/assistant-service/build.gradle index 89f90eee..e2c792fb 100644 --- a/code/services-core/assistant-service/build.gradle +++ b/code/services-core/assistant-service/build.gradle @@ -27,6 +27,7 @@ dependencies { implementation project(':code:common:config') implementation project(':code:common:service') implementation project(':code:common:model') + implementation project(':code:common:db') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java index 3992986b..592e6308 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java @@ -3,6 +3,8 @@ package nu.marginalia.assistant; import com.google.gson.Gson; import com.google.inject.Inject; import lombok.SneakyThrows; +import nu.marginalia.assistant.domains.DomainInformationService; +import nu.marginalia.assistant.domains.SimilarDomainsService; import nu.marginalia.assistant.eval.Units; import nu.marginalia.assistant.suggest.Suggestions; import nu.marginalia.assistant.eval.MathParser; @@ -16,11 +18,16 @@ import spark.Request; import spark.Response; import spark.Spark; +import java.util.ArrayList; +import java.util.Objects; + public class AssistantService extends Service { private final Logger logger = LoggerFactory.getLogger(getClass()); private final Gson gson = GsonFactory.get(); private final Units units; private final MathParser mathParser; + private final SimilarDomainsService similarDomainsService; + private final DomainInformationService domainInformationService; private final Suggestions suggestions; @SneakyThrows @@ -30,12 +37,16 @@ public class AssistantService extends Service { MathParser mathParser, Units units, ScreenshotService screenshotService, + SimilarDomainsService similarDomainsService, + DomainInformationService domainInformationService, Suggestions suggestions) { super(params); this.mathParser = mathParser; this.units = units; + this.similarDomainsService = similarDomainsService; + this.domainInformationService = domainInformationService; this.suggestions = suggestions; Spark.staticFiles.expireTime(600); @@ -56,12 +67,50 @@ public class AssistantService extends Service { rsp, req.queryParams("value") )); - + Spark.get("/domain/:id/similar", this::getSimilarDomains, this::convertToJson); + Spark.get("/domain/:id/linking", this::getLinkingDomains, this::convertToJson); + Spark.get("/domain/:id/info", this::getDomainInformation, this::convertToJson); Spark.get("/public/suggest/", this::getSuggestions, this::convertToJson); Spark.awaitInitialization(); } + private Object getSimilarDomains(Request request, Response response) { + int domainId = Integer.parseInt(request.params("id")); + int count = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("count"), "25")); + + response.type("application/json"); + + if (!similarDomainsService.isReady()) { + return new ArrayList<>(); + } + + return similarDomainsService.getSimilarDomains(domainId, count); + } + + private Object getLinkingDomains(Request request, Response response) { + int domainId = Integer.parseInt(request.params("id")); + int count = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("count"), "25")); + + response.type("application/json"); + if (!similarDomainsService.isReady()) { + return new ArrayList<>(); + } + return similarDomainsService.getLinkingDomains(domainId, count); + } + + private Object getDomainInformation(Request request, Response response) { + int domainId = Integer.parseInt(request.params("id")); + + response.type("application/json"); + + var maybeDomainInfo = domainInformationService.domainInfo(domainId); + if (maybeDomainInfo.isEmpty()) { + Spark.halt(404); + } + return maybeDomainInfo.get(); + } + private Object getSuggestions(Request request, Response response) { response.type("application/json"); var param = request.queryParams("partial"); diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java similarity index 95% rename from code/services-application/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java rename to code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java index c05cfec2..948dfd44 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java @@ -1,11 +1,11 @@ -package nu.marginalia.search.siteinfo; +package nu.marginalia.assistant.domains; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.db.DbDomainQueries; -import nu.marginalia.search.model.DomainInformation; +import nu.marginalia.assistant.client.model.DomainInformation; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,13 +36,7 @@ public class DomainInformationService { } - public Optional domainInfo(String site) { - - OptionalInt maybeDomainId = getDomainFromPartial(site); - if (maybeDomainId.isEmpty()) { - return Optional.empty(); - } - int domainId = maybeDomainId.getAsInt(); + public Optional domainInfo(int domainId) { Optional domain = dbDomainQueries.getDomain(domainId); if (domain.isEmpty()) { @@ -61,7 +55,6 @@ public class DomainInformationService { double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100; DomainIndexingState state = getDomainState(domainId); - List linkingDomains = getLinkingDomains(domainId); var di = DomainInformation.builder() .domain(domain.get()) @@ -73,7 +66,6 @@ public class DomainInformationService { .outboundLinks(outboundLinks) .ranking(rank) .state(state.desc) - .linkingDomains(linkingDomains) .inCrawlQueue(inCrawlQueue) .nodeAffinity(nodeAffinity) .suggestForCrawling((pagesVisited == 0 && outboundLinks == 0 && !inCrawlQueue)) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SimilarDomainsService.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/SimilarDomainsService.java similarity index 84% rename from code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SimilarDomainsService.java rename to code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/SimilarDomainsService.java index 51699dcc..9156408c 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SimilarDomainsService.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/SimilarDomainsService.java @@ -1,4 +1,4 @@ -package nu.marginalia.search.svc; +package nu.marginalia.assistant.domains; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; @@ -6,11 +6,10 @@ import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntDoubleHashMap; import gnu.trove.map.hash.TIntIntHashMap; -import gnu.trove.map.hash.TLongDoubleHashMap; import gnu.trove.set.TIntSet; import gnu.trove.set.hash.TIntHashSet; +import nu.marginalia.assistant.client.model.SimilarDomain; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,6 +39,8 @@ public class SimilarDomainsService { public volatile double[] domainRanks = null; public volatile String[] domainNames = null; + volatile boolean isReady = false; + @Inject public SimilarDomainsService(HikariDataSource dataSource) { this.dataSource = dataSource; @@ -167,6 +168,7 @@ public class SimilarDomainsService { logger.info("Loaded {} domains", domainRanks.length); logger.info("All done!"); + isReady = true; } } catch (SQLException throwables) { @@ -174,7 +176,11 @@ public class SimilarDomainsService { } } - double getRelatedness(int a, int b) { + public boolean isReady() { + return isReady; + } + + private double getRelatedness(int a, int b) { int lowerIndex = Math.min(domainIdToIdx.get(a), domainIdToIdx.get(b)); int higherIndex = Math.max(domainIdToIdx.get(a), domainIdToIdx.get(b)); @@ -233,14 +239,14 @@ public class SimilarDomainsService { indexedDomains.get(idx), activeDomains.get(idx), screenshotDomains.get(idx), - LinkType.find( + SimilarDomain.LinkType.find( linkingIdsStoD.contains(idx), linkingIdsDtoS.contains(idx) ) )); } - domains.removeIf(d -> d.url.domain.toString().length() > 32); + domains.removeIf(d -> d.url().domain.toString().length() > 32); return domains; } @@ -319,84 +325,16 @@ public class SimilarDomainsService { indexedDomains.get(idx), activeDomains.get(idx), screenshotDomains.get(idx), - LinkType.find( + SimilarDomain.LinkType.find( linkingIdsStoD.contains(idx), linkingIdsDtoS.contains(idx) ) )); } - domains.removeIf(d -> d.url.domain.toString().length() > 32); + domains.removeIf(d -> d.url().domain.toString().length() > 32); return domains; } - public record SimilarDomain(EdgeUrl url, - int domainId, - double relatedness, - double rank, - boolean indexed, - boolean active, - boolean screenshot, - LinkType linkType) - { - - public String getRankSymbols() { - if (rank > 90) { - return "★★★★★"; - } - if (rank > 70) { - return "★★★★"; - } - if (rank > 50) { - return "★★★"; - } - if (rank > 30) { - return "★★"; - } - if (rank > 10) { - return "★"; - } - return ""; - } - } - - enum LinkType { - BACKWARD, - FOWARD, - BIDIRECTIONAL, - NONE; - - public static LinkType find(boolean linkStod, - boolean linkDtos) - { - if (linkDtos && linkStod) - return BIDIRECTIONAL; - if (linkDtos) - return FOWARD; - if (linkStod) - return BACKWARD; - - return NONE; - } - - public String toString() { - return switch (this) { - case FOWARD -> "→"; - case BACKWARD -> "←"; - case BIDIRECTIONAL -> "⇆"; - case NONE -> "-"; - }; - } - - public String getDescription() { - return switch (this) { - case BACKWARD -> "Backward Link"; - case FOWARD -> "Forward Link"; - case BIDIRECTIONAL -> "Mutual Link"; - case NONE -> "No Link"; - }; - } - }; - } \ No newline at end of file diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java index a75ab75a..7adf7921 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java @@ -20,8 +20,9 @@ import java.util.stream.Collectors; import java.util.stream.Stream; public class Suggestions { - private final PatriciaTrie suggestionsTrie; - private final TermFrequencyDict termFrequencyDict; + private PatriciaTrie suggestionsTrie = null; + private TermFrequencyDict termFrequencyDict = null; + private volatile boolean ready = false; private final SpellChecker spellChecker; private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$"); @@ -35,10 +36,12 @@ public class Suggestions { ) { this.spellChecker = spellChecker; - suggestionsTrie = loadSuggestions(suggestionsFile); - termFrequencyDict = dict; - - logger.info("Loaded {} suggestions", suggestionsTrie.size()); + Thread.ofPlatform().start(() -> { + suggestionsTrie = loadSuggestions(suggestionsFile); + termFrequencyDict = dict; + ready = true; + logger.info("Loaded {} suggestions", suggestionsTrie.size()); + }); } private static PatriciaTrie loadSuggestions(Path file) { @@ -71,6 +74,9 @@ public class Suggestions { } public List getSuggestions(int count, String searchWord) { + if (!ready) + return Collections.emptyList(); + if (searchWord.length() < MIN_SUGGEST_LENGTH) { return Collections.emptyList(); } @@ -126,6 +132,9 @@ public class Suggestions { public Stream getSuggestionsForKeyword(int count, String prefix) { + if (!ready) + return Stream.empty(); + if (prefix.length() < MIN_SUGGEST_LENGTH) { return Stream.empty(); } From 6382f779c395e0dafac48610f6141a8f8afaa0de Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 9 Dec 2023 16:34:12 +0100 Subject: [PATCH 07/15] (search) Revert back to using 'Popular' as the default search filter Unfiltered is a bit too ... unfiltered, and gives a bad first impression for many queries. --- .../src/main/java/nu/marginalia/search/SearchService.java | 1 - .../main/java/nu/marginalia/search/model/SearchProfile.java | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchService.java index 13f14e6e..1ed36a98 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchService.java @@ -31,7 +31,6 @@ public class SearchService extends Service { SearchFrontPageService frontPageService, SearchErrorPageService errorPageService, SearchAddToCrawlQueueService addToCrawlQueueService, - SearchFlagSiteService flagSiteService, SearchSiteInfoService siteInfoService, SearchQueryService searchQueryService ) { diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java index ab61ca11..c1ed2c0b 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java @@ -38,7 +38,7 @@ public enum SearchProfile { private final static SearchProfile[] values = values(); public static SearchProfile getSearchProfile(String param) { if (null == param) { - return NO_FILTER; + return DEFAULT; } for (var profile : values) { @@ -47,7 +47,7 @@ public enum SearchProfile { } } - return NO_FILTER; + return DEFAULT; } public void addTacitTerms(SearchSubquery subquery) { From e3ebb0c5bb2e8ca9c2b12ce4f71522708f02cb1a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 9 Dec 2023 16:39:46 +0100 Subject: [PATCH 08/15] (*) Rename the search filter 'RETRO' into 'POPULAR' This will make the terminology more consistent between the GUI and the code. The rankings yaml still uses 'retro' though, for to retain compatibility. --- .../model/query/SearchSetIdentifier.java | 2 +- .../nu/marginalia/api/ApiSearchOperator.java | 4 +--- .../marginalia/search/model/SearchFilters.java | 3 +-- .../marginalia/search/model/SearchProfile.java | 7 +++---- .../index/svc/IndexSearchSetsService.java | 18 +++++++++--------- 5 files changed, 15 insertions(+), 19 deletions(-) diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java index e89d6d8b..4eb0eae0 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java @@ -7,7 +7,7 @@ package nu.marginalia.index.client.model.query; * */ public enum SearchSetIdentifier { NONE, - RETRO, + POPULAR, BLOGS, ACADEMIA, SMALLWEB diff --git a/code/services-application/api-service/src/main/java/nu/marginalia/api/ApiSearchOperator.java b/code/services-application/api-service/src/main/java/nu/marginalia/api/ApiSearchOperator.java index 7dca777c..f60a1750 100644 --- a/code/services-application/api-service/src/main/java/nu/marginalia/api/ApiSearchOperator.java +++ b/code/services-application/api-service/src/main/java/nu/marginalia/api/ApiSearchOperator.java @@ -10,8 +10,6 @@ import nu.marginalia.index.client.model.query.SearchSetIdentifier; import nu.marginalia.index.client.model.results.DecoratedSearchResultItem; import nu.marginalia.index.client.model.results.SearchResultKeywordScore; import nu.marginalia.index.query.limit.QueryLimits; -import nu.marginalia.index.query.limit.SpecificationLimit; -import nu.marginalia.index.searchset.SearchSet; import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.query.client.QueryClient; import nu.marginalia.query.model.QueryParams; @@ -64,7 +62,7 @@ public class ApiSearchOperator { return switch (index) { case 0 -> SearchSetIdentifier.NONE; case 1 -> SearchSetIdentifier.SMALLWEB; - case 2 -> SearchSetIdentifier.RETRO; + case 2 -> SearchSetIdentifier.POPULAR; case 3 -> SearchSetIdentifier.NONE; case 5 -> SearchSetIdentifier.NONE; default -> SearchSetIdentifier.NONE; diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchFilters.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchFilters.java index 0ce79b31..3afdef7f 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchFilters.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchFilters.java @@ -5,7 +5,6 @@ import nu.marginalia.WebsiteUrl; import nu.marginalia.search.command.SearchAdtechParameter; import nu.marginalia.search.command.SearchJsParameter; import nu.marginalia.search.command.SearchParameters; -import org.apache.regexp.RE; import java.util.List; @@ -37,7 +36,7 @@ public class SearchFilters { filterGroups = List.of( List.of( new Filter("No Filter", SearchProfile.NO_FILTER, parameters), - new Filter("Popular", SearchProfile.DEFAULT, parameters), + new Filter("Popular", SearchProfile.POPULAR, parameters), new Filter("Small Web", SearchProfile.SMALLWEB, parameters), new Filter("Blogosphere", SearchProfile.BLOGOSPHERE, parameters), new Filter("Academia", SearchProfile.ACADEMIA, parameters) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java index c1ed2c0b..97bf4d09 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java @@ -8,11 +8,10 @@ import nu.marginalia.index.client.model.query.SearchSetIdentifier; import java.util.Objects; public enum SearchProfile { - DEFAULT("default", SearchSetIdentifier.RETRO), + POPULAR("default", SearchSetIdentifier.POPULAR), SMALLWEB("modern", SearchSetIdentifier.SMALLWEB), BLOGOSPHERE("blogosphere", SearchSetIdentifier.BLOGS), NO_FILTER("corpo", SearchSetIdentifier.NONE), - YOLO("yolo", SearchSetIdentifier.NONE), VINTAGE("vintage", SearchSetIdentifier.NONE), TILDE("tilde", SearchSetIdentifier.NONE), CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE), @@ -38,7 +37,7 @@ public enum SearchProfile { private final static SearchProfile[] values = values(); public static SearchProfile getSearchProfile(String param) { if (null == param) { - return DEFAULT; + return POPULAR; } for (var profile : values) { @@ -47,7 +46,7 @@ public enum SearchProfile { } } - return DEFAULT; + return POPULAR; } public void addTacitTerms(SearchSubquery subquery) { diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java index 4c06bf2f..47dcf5b2 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java @@ -37,7 +37,7 @@ public class IndexSearchSetsService { // Below are binary indices that are used to constrain a search - private volatile RankingSearchSet retroSet; + private volatile RankingSearchSet popularSet; private volatile RankingSearchSet smallWebSet; private volatile RankingSearchSet academiaSet; private volatile RankingSearchSet blogsSet; @@ -72,7 +72,7 @@ public class IndexSearchSetsService { smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat")); academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat")); - retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat")); + popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, servicesFactory.getSearchSetsBase().resolve("popular.dat")); blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, servicesFactory.getSearchSetsBase().resolve("blogs.dat")); } @@ -86,7 +86,7 @@ public class IndexSearchSetsService { } return switch (searchSetIdentifier) { case NONE -> anySet; - case RETRO -> retroSet; + case POPULAR -> popularSet; case ACADEMIA -> academiaSet; case SMALLWEB -> smallWebSet; case BLOGS -> blogsSet; @@ -95,7 +95,7 @@ public class IndexSearchSetsService { enum RepartitionSteps { UPDATE_ACADEMIA, - UPDATE_RETRO, + UPDATE_POPULAR, UPDATE_SMALL_WEB, UPDATE_BLOGS, UPDATE_RANKINGS, @@ -107,8 +107,8 @@ public class IndexSearchSetsService { processHeartbeat.progress(RepartitionSteps.UPDATE_ACADEMIA); updateAcademiaDomainsSet(); - processHeartbeat.progress(RepartitionSteps.UPDATE_RETRO); - updateRetroDomainsSet(); + processHeartbeat.progress(RepartitionSteps.UPDATE_POPULAR); + updatePopularDomainsSet(); processHeartbeat.progress(RepartitionSteps.UPDATE_SMALL_WEB); updateSmallWebDomainsSet(); @@ -139,15 +139,15 @@ public class IndexSearchSetsService { } @SneakyThrows - public void updateRetroDomainsSet() { + public void updatePopularDomainsSet() { var entry = rankingSettings.retro; var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new)); var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new); synchronized (this) { - retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data); - retroSet.write(); + popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, popularSet.source, data); + popularSet.write(); } } From f0e736d4eab4ac7b82386a06df7d8b82677b4af3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 9 Dec 2023 16:46:51 +0100 Subject: [PATCH 09/15] (search) Update the search profile 'Academia' to strictly filter on academic tlds The previous version used a personalized pagerank centering on a few academic domains, but this didn't work very well and most results were not very academia-centric. --- .../marginalia/search/model/SearchProfile.java | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java index 97bf4d09..630e7f1a 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java @@ -15,11 +15,9 @@ public enum SearchProfile { VINTAGE("vintage", SearchSetIdentifier.NONE), TILDE("tilde", SearchSetIdentifier.NONE), CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE), - ACADEMIA("academia", SearchSetIdentifier.ACADEMIA), + ACADEMIA("academia", SearchSetIdentifier.NONE), PLAIN_TEXT("plain-text", SearchSetIdentifier.NONE), FOOD("food", SearchSetIdentifier.NONE), - CRAFTS("crafts", SearchSetIdentifier.NONE), - CLASSICS("classics", SearchSetIdentifier.NONE), FORUM("forum", SearchSetIdentifier.NONE), WIKI("wiki", SearchSetIdentifier.NONE), DOCS("docs", SearchSetIdentifier.NONE), @@ -51,7 +49,7 @@ public enum SearchProfile { public void addTacitTerms(SearchSubquery subquery) { if (this == ACADEMIA) { - subquery.searchTermsPriority.add("tld:edu"); + subquery.searchTermsAdvice.add("special:academia"); } if (this == VINTAGE) { subquery.searchTermsPriority.add("format:html123"); @@ -75,9 +73,6 @@ public enum SearchProfile { if (this == FOOD) { subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_FOOD.getKeyword()); } - if (this == CRAFTS) { - subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_CRAFTS.getKeyword()); - } } public SpecificationLimit getYearLimit() { @@ -105,13 +100,5 @@ public enum SearchProfile { else return SpecificationLimit.none(); } - - - public String getNearDomain() { - if (this == CLASSICS) { - return "classics.mit.edu"; - } - return null; - } } From 37af60254fad941b19654b88d2027cf684b3a6a9 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 9 Dec 2023 16:53:06 +0100 Subject: [PATCH 10/15] (search) Better recipe filter Tune the recipe filter to give better results, by using the 'popular' domains set along with excluding results with heavy tracking. --- .../main/java/nu/marginalia/search/model/SearchProfile.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java index 630e7f1a..e0daa79c 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java @@ -17,7 +17,7 @@ public enum SearchProfile { CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE), ACADEMIA("academia", SearchSetIdentifier.NONE), PLAIN_TEXT("plain-text", SearchSetIdentifier.NONE), - FOOD("food", SearchSetIdentifier.NONE), + FOOD("food", SearchSetIdentifier.POPULAR), FORUM("forum", SearchSetIdentifier.NONE), WIKI("wiki", SearchSetIdentifier.NONE), DOCS("docs", SearchSetIdentifier.NONE), @@ -72,6 +72,7 @@ public enum SearchProfile { } if (this == FOOD) { subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_FOOD.getKeyword()); + subquery.searchTermsExclude.add("special:ads"); } } From 91dd45cf64fdabfd5b21af1783d3ceb3cad89940 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 9 Dec 2023 20:04:27 +0100 Subject: [PATCH 11/15] (search) IP and IP geolocation in site info view This commit also fixes a bug in the loader where the IP field wouldn't always populate as intended, and refactors the DomainInformationService to use significantly fewer SQL queries. --- .../client/model/DomainInformation.java | 19 +- .../ip_blocklist/GeoIpBlocklist.java | 1 + .../DomainRecordParquetFileReader.java | 5 +- .../model/processed/DomainRecord.java | 19 +- .../model/processed/DomainWithIp.java | 6 + .../loading/domains/DomainLoaderService.java | 36 ++- .../domains/DomainLoaderServiceTest.java | 6 +- .../site-info/site-info-index-indexed.hdb | 1 + .../domains/DomainInformationService.java | 280 ++++-------------- .../assistant/domains/GeoIpDictionary.java | 76 +++++ 10 files changed, 201 insertions(+), 248 deletions(-) create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java create mode 100644 code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/GeoIpDictionary.java diff --git a/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DomainInformation.java b/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DomainInformation.java index 5a78fba8..5cf28278 100644 --- a/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DomainInformation.java +++ b/code/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DomainInformation.java @@ -3,8 +3,6 @@ package nu.marginalia.assistant.client.model; import lombok.*; import nu.marginalia.model.EdgeDomain; -import java.util.List; - @Getter @AllArgsConstructor @NoArgsConstructor @Builder @ToString public class DomainInformation { @@ -23,5 +21,22 @@ public class DomainInformation { boolean inCrawlQueue; boolean unknownDomain; + String ip; + String ipCountry; String state; + + public String getIpFlag() { + if (ipCountry == null || ipCountry.isBlank()) { + return ""; + } + String country = ipCountry; + if ("UK".equals(country)) { + return "GB"; + } + int offset = 0x1F1E6; + int asciiOffset = 0x41; + int firstChar = Character.codePointAt(country, 0) - asciiOffset + offset; + int secondChar = Character.codePointAt(country, 1) - asciiOffset + offset; + return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar)); + } } diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java index f91ab135..ba896317 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java @@ -18,6 +18,7 @@ import java.util.TreeMap; @Singleton public class GeoIpBlocklist { private final TreeMap ranges = new TreeMap<>(); + private final Set blacklist = Set.of("CN", "HK"); private final Set graylist = Set.of("RU", "TW", "IN", "ZA", "SG", "UA"); diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java index a31b199d..a0714557 100644 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java @@ -3,6 +3,7 @@ package nu.marginalia.io.processed; import blue.strategic.parquet.HydratorSupplier; import blue.strategic.parquet.ParquetReader; import nu.marginalia.model.processed.DomainRecord; +import nu.marginalia.model.processed.DomainWithIp; import org.jetbrains.annotations.NotNull; import java.io.IOException; @@ -19,10 +20,10 @@ public class DomainRecordParquetFileReader { } @NotNull - public static List getDomainNames(Path path) throws IOException { + public static List getBasicDomainInformation(Path path) throws IOException { return ParquetReader.streamContent(path.toFile(), HydratorSupplier.constantly(DomainRecord.newDomainNameHydrator()), - List.of("domain")) + List.of("domain", "ip")) .toList(); } diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java index e3a0c9f9..6b3491bf 100644 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java @@ -48,8 +48,8 @@ public class DomainRecord { return DomainRecord::dehydrate; } - public static Hydrator newDomainNameHydrator() { - return new DomainNameHydrator(); + public static Hydrator newDomainNameHydrator() { + return new DomainWithIpHydrator(); } @@ -124,23 +124,26 @@ class DomainHydrator implements Hydrator { } } -class DomainNameHydrator implements Hydrator { +class DomainWithIpHydrator implements Hydrator { @Override - public String start() { - return ""; + public DomainWithIp start() { + return new DomainWithIp(); } @Override - public String add(String target, String heading, Object value) { + public DomainWithIp add(DomainWithIp target, String heading, Object value) { if ("domain".equals(heading)) { - return (String) value; + target.domain = (String) value; + } + else if ("ip".equals(heading)) { + target.ip = (String) value; } return target; } @Override - public String finish(String target) { + public DomainWithIp finish(DomainWithIp target) { return target; } } \ No newline at end of file diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java new file mode 100644 index 00000000..bedae4d5 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java @@ -0,0 +1,6 @@ +package nu.marginalia.model.processed; + +public class DomainWithIp { + public String domain; + public String ip; +} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java index 3ce30d96..911c976d 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java @@ -9,6 +9,7 @@ import nu.marginalia.io.processed.DomainRecordParquetFileReader; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.processed.DomainRecord; +import nu.marginalia.model.processed.DomainWithIp; import nu.marginalia.process.control.ProcessHeartbeatImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -51,9 +52,9 @@ public class DomainLoaderService { ) { try (var inserter = new DomainInserter(conn, nodeId)) { - for (var domain : readSetDomainNames(inputData)) { - inserter.accept(new EdgeDomain(domain)); - domainNamesAll.add(domain); + for (var domainWithIp : readBasicDomainInformation(inputData)) { + inserter.accept(new EdgeDomain(domainWithIp.domain)); + domainNamesAll.add(domainWithIp.domain); } } try (var inserter = new DomainInserter(conn, -1)) { @@ -63,9 +64,9 @@ public class DomainLoaderService { } } - try (var updater = new DomainAffinityUpdater(conn, nodeId)) { - for (var domain : readSetDomainNames(inputData)) { - updater.accept(new EdgeDomain(domain)); + try (var updater = new DomainAffinityAndIpUpdater(conn, nodeId)) { + for (var domainWithIp : readBasicDomainInformation(inputData)) { + updater.accept(new EdgeDomain(domainWithIp.domain), domainWithIp.ip); } } @@ -84,15 +85,15 @@ public class DomainLoaderService { return ret; } - Collection readSetDomainNames(LoaderInputData inputData) throws IOException { - final Set domainNamesAll = new HashSet<>(100_000); + Collection readBasicDomainInformation(LoaderInputData inputData) throws IOException { + final Set domainsAll = new HashSet<>(100_000); var domainFiles = inputData.listDomainFiles(); for (var file : domainFiles) { - domainNamesAll.addAll(DomainRecordParquetFileReader.getDomainNames(file)); + domainsAll.addAll(DomainRecordParquetFileReader.getBasicDomainInformation(file)); } - return domainNamesAll; + return domainsAll; } Collection readReferencedDomainNames(LoaderInputData inputData) throws IOException { @@ -164,20 +165,25 @@ public class DomainLoaderService { statement.close(); } } - private static class DomainAffinityUpdater implements AutoCloseable { + private static class DomainAffinityAndIpUpdater implements AutoCloseable { private final PreparedStatement statement; private final int nodeAffinity; private int count = 0; - public DomainAffinityUpdater(Connection connection, int affinity) throws SQLException { + public DomainAffinityAndIpUpdater(Connection connection, int affinity) throws SQLException { this.nodeAffinity = affinity; - statement = connection.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY = ? WHERE DOMAIN_NAME=?"); + statement = connection.prepareStatement(""" + UPDATE EC_DOMAIN + SET NODE_AFFINITY = ?, IP = ? + WHERE DOMAIN_NAME=? + """); } - public void accept(EdgeDomain domain) throws SQLException { + public void accept(EdgeDomain domain, String ip) throws SQLException { statement.setInt(1, nodeAffinity); - statement.setString(2, domain.toString()); + statement.setString(2, ip); + statement.setString(3, domain.toString()); statement.addBatch(); if (++count > 1000) { diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java index e199580f..8f2aee4e 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java @@ -6,7 +6,6 @@ import nu.marginalia.ProcessConfiguration; import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter; import nu.marginalia.io.processed.DomainRecordParquetFileWriter; import nu.marginalia.io.processed.ProcessedDataFileNames; -import nu.marginalia.loader.DbTestUtil; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.model.processed.DomainLinkRecord; import nu.marginalia.model.processed.DomainRecord; @@ -21,10 +20,7 @@ import org.testcontainers.junit.jupiter.Testcontainers; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.sql.SQLException; import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; import static org.junit.jupiter.api.Assertions.*; @@ -99,7 +95,7 @@ class DomainLoaderServiceTest { // Verify Set expectedDomains1 = Sets.union(new HashSet<>(domains1), new HashSet<>(domains2)); - assertEquals(expectedDomains1, domainService.readSetDomainNames(new LoaderInputData(workDir, 2))); + assertEquals(expectedDomains1, domainService.readBasicDomainInformation(new LoaderInputData(workDir, 2))); Set expectedDomains2 = new HashSet<>(linkDomains); assertEquals(expectedDomains2, domainService.readReferencedDomainNames(new LoaderInputData(workDir, 2))); diff --git a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb index 5b6e40dd..979226a1 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb @@ -6,5 +6,6 @@ Pages Known: {{pagesKnown}}
Pages Crawled: {{pagesFetched}}
Pages Indexed: {{pagesIndexed}}
+ IP: {{ip}} {{#if ipCountry}}{{getIpFlag}}{{/if}}

\ No newline at end of file diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java index 948dfd44..4da309dc 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java @@ -11,17 +11,14 @@ import org.slf4j.LoggerFactory; import com.google.inject.Inject; import com.google.inject.Singleton; + +import java.sql.ResultSet; import java.sql.SQLException; import java.util.*; -/* - TODO: This class needs to be refactored, a lot of - these SQL queries are redundant and can be - collapsed into one single query that fetches - all the information - */ @Singleton public class DomainInformationService { + private final GeoIpDictionary geoIpDictionary; private DbDomainQueries dbDomainQueries; private HikariDataSource dataSource; @@ -30,8 +27,10 @@ public class DomainInformationService { @Inject public DomainInformationService( DbDomainQueries dbDomainQueries, + GeoIpDictionary geoIpDictionary, HikariDataSource dataSource) { this.dbDomainQueries = dbDomainQueries; + this.geoIpDictionary = geoIpDictionary; this.dataSource = dataSource; } @@ -43,226 +42,75 @@ public class DomainInformationService { return Optional.empty(); } - boolean blacklisted = isBlacklisted(domain.get()); - int pagesKnown = getPagesKnown(domainId); - int pagesVisited = getPagesVisited(domainId); - int pagesIndexed = getPagesIndexed(domainId); - int incomingLinks = getIncomingLinks(domainId); - int outboundLinks = getOutboundLinks(domainId); - int nodeAffinity = getNodeAffinity(domainId); - boolean inCrawlQueue = inCrawlQueue(domainId); - double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100; + var builder = DomainInformation.builder(); + try (var connection = dataSource.getConnection(); + var stmt = connection.createStatement(); + ) { + boolean inCrawlQueue; + int outboundLinks = 0; + int pagesVisited = 0; - DomainIndexingState state = getDomainState(domainId); + ResultSet rs; - var di = DomainInformation.builder() - .domain(domain.get()) - .blacklisted(blacklisted) - .pagesKnown(pagesKnown) - .pagesFetched(pagesVisited) - .pagesIndexed(pagesIndexed) - .incomingLinks(incomingLinks) - .outboundLinks(outboundLinks) - .ranking(rank) - .state(state.desc) - .inCrawlQueue(inCrawlQueue) - .nodeAffinity(nodeAffinity) - .suggestForCrawling((pagesVisited == 0 && outboundLinks == 0 && !inCrawlQueue)) - .build(); + rs = stmt.executeQuery(STR.""" + SELECT IP, NODE_AFFINITY, DOMAIN_NAME, STATE, IFNULL(RANK, 1) AS RANK + FROM EC_DOMAIN WHERE ID=\{domainId} + """); + if (rs.next()) { + String ip = rs.getString("IP"); - return Optional.of(di); - } + builder.ip(ip); + builder.ipCountry(geoIpDictionary.getCountry(ip)); - private int getNodeAffinity(int domainId) { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement(""" - SELECT NODE_AFFINITY FROM EC_DOMAIN WHERE ID=? - """)) { - stmt.setInt(1, domainId); - var rs = stmt.executeQuery(); - if (rs.next()) - return rs.getInt(1); + builder.nodeAffinity(rs.getInt("NODE_AFFINITY")); + builder.domain(new EdgeDomain(rs.getString("DOMAIN_NAME"))); + builder.state(rs.getString("STATE")); + builder.ranking(Math.round(100.0*(1.0-rs.getDouble("RANK")))); } + rs = stmt.executeQuery(STR.""" + SELECT 1 FROM CRAWL_QUEUE + INNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME + WHERE EC_DOMAIN.ID=\{domainId} + """); + inCrawlQueue = rs.next(); + builder.inCrawlQueue(inCrawlQueue); + + rs = stmt.executeQuery(STR.""" + SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=\{domainId} + """); + if (rs.next()) { + builder.incomingLinks(rs.getInt(1)); + } + + rs = stmt.executeQuery(STR.""" + SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=\{domainId} + """); + if (rs.next()) { + builder.outboundLinks(rs.getInt(1)); + outboundLinks = rs.getInt(1); + } + + + rs = stmt.executeQuery(STR.""" + SELECT KNOWN_URLS, GOOD_URLS, VISITED_URLS FROM DOMAIN_METADATA WHERE ID=\{domainId} + """); + if (rs.next()) { + pagesVisited = rs.getInt("VISITED_URLS"); + + builder.pagesKnown(rs.getInt("KNOWN_URLS")); + builder.pagesIndexed(rs.getInt("GOOD_URLS")); + builder.pagesFetched(rs.getInt("VISITED_URLS")); + } + + builder.suggestForCrawling((pagesVisited == 0 && outboundLinks == 0 && !inCrawlQueue)); + + return Optional.of(builder.build()); } catch (SQLException ex) { logger.error("SQL error", ex); - } - return -1; - } - - @SneakyThrows - private boolean inCrawlQueue(int domainId) { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement( - """ - SELECT 1 FROM CRAWL_QUEUE - INNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME - WHERE EC_DOMAIN.ID=? - """)) - { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - return rsp.next(); - } + return Optional.empty(); } } - private OptionalInt getDomainFromPartial(String site) { - return dbDomainQueries.tryGetDomainId(new EdgeDomain(site)); - } - - @SneakyThrows - public boolean isBlacklisted(EdgeDomain domain) { - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN IN (?,?)")) { - stmt.setString(1, domain.domain); - stmt.setString(2, domain.toString()); - var rsp = stmt.executeQuery(); - return rsp.next(); - } - } - } - - @SneakyThrows - public int getPagesKnown(int domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - public int getPagesVisited(int domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - - @SneakyThrows - public int getPagesIndexed(int domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - public int getIncomingLinks(int domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - @SneakyThrows - public int getOutboundLinks(int domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - public DomainIndexingState getDomainState(int domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return DomainIndexingState.valueOf(rsp.getString(1)); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return DomainIndexingState.ERROR; - } - - public List getLinkingDomains(int domainId) { - try (var connection = dataSource.getConnection()) { - List results = new ArrayList<>(25); - try (var stmt = connection.prepareStatement("SELECT SOURCE_DOMAIN FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - results.add(new EdgeDomain(rsp.getString(1))); - } - return results; - } catch (Exception ex) { - logger.error("DB error", ex); - } - - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return Collections.emptyList(); - } - - public double getRank(int domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getDouble(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return 1; - } } diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/GeoIpDictionary.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/GeoIpDictionary.java new file mode 100644 index 00000000..e250761e --- /dev/null +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/GeoIpDictionary.java @@ -0,0 +1,76 @@ +package nu.marginalia.assistant.domains; + +import com.opencsv.CSVReader; +import lombok.AllArgsConstructor; +import nu.marginalia.WmsaHome; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.FileReader; +import java.net.InetAddress; +import java.util.TreeMap; + +public class GeoIpDictionary { + private volatile TreeMap ranges = null; + private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class); + + @AllArgsConstructor + static class IpRange { + public final long from; + public final long to; + public final String country; + } + + public GeoIpDictionary() { + Thread.ofPlatform().start(() -> { + try (var reader = new CSVReader(new FileReader(WmsaHome.getIPLocationDatabse().toFile()))) { + var dict = new TreeMap(); + + for (;;) { + String[] vals = reader.readNext(); + if (vals == null) { + break; + } + var range = new IpRange(Long.parseLong(vals[0]), + Long.parseLong(vals[1]), + vals[2]); + dict.put(range.from, range); + } + ranges = dict; + logger.info("Loaded {} IP ranges", ranges.size()); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + + } + + public String getCountry(String ip) { + try { + return getCountry(InetAddress.getByName(ip)); + } catch (Exception e) { + return ""; + } + } + + public String getCountry(InetAddress address) { + if (null == ranges) { // not loaded yet or failed to load + return ""; + } + + byte[] bytes = address.getAddress(); + long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF); + + Long key = ranges.floorKey(ival); + if (null == key) { + return ""; + } + + var range = ranges.get(key); + if (ival >= key && ival < range.to) { + return range.country; + } + + return ""; + } +} From 84b415855561fbfc0f4c88429b0393949782d688 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 10 Dec 2023 14:29:18 +0100 Subject: [PATCH 12/15] (minor) Fix broken test --- .../io/processed/DomainRecordParquetFileReaderTest.java | 8 ++++++-- .../loading/domains/DomainLoaderServiceTest.java | 3 ++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java b/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java index 7c73f13e..b1867100 100644 --- a/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java +++ b/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java @@ -1,6 +1,7 @@ package nu.marginalia.io.processed; import nu.marginalia.model.processed.DomainRecord; +import nu.marginalia.model.processed.DomainWithIp; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -53,8 +54,11 @@ class DomainRecordParquetFileReaderTest { writer.write(second); } - var domainNames = DomainRecordParquetFileReader.getDomainNames(parquetFile); - assertEquals(List.of("www.marginalia.nu", "memex.marginalia.nu"), domainNames); + var domainInfo = DomainRecordParquetFileReader.getBasicDomainInformation(parquetFile); + assertEquals(List.of( + new DomainWithIp("www.marginalia.nu", "127.0.0.1"), + new DomainWithIp("memex.marginalia.nu", "127.0.0.1")), + domainInfo); var items = DomainRecordParquetFileReader .stream(parquetFile) diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java index 8f2aee4e..d235fe74 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.*; +import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.*; @@ -95,7 +96,7 @@ class DomainLoaderServiceTest { // Verify Set expectedDomains1 = Sets.union(new HashSet<>(domains1), new HashSet<>(domains2)); - assertEquals(expectedDomains1, domainService.readBasicDomainInformation(new LoaderInputData(workDir, 2))); + assertEquals(expectedDomains1, domainService.readBasicDomainInformation(new LoaderInputData(workDir, 2)).stream().map(d -> d.domain).collect(Collectors.toSet())); Set expectedDomains2 = new HashSet<>(linkDomains); assertEquals(expectedDomains2, domainService.readReferencedDomainNames(new LoaderInputData(workDir, 2))); From f655ec5a5c996abe77bdad6e7e043571db361792 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 10 Dec 2023 17:30:43 +0100 Subject: [PATCH 13/15] (*) Refactor GeoIP-related code In this commit, GeoIP-related classes are refactored and relocated to a common library as they are shared across multiple services. The crawler is refactored to enable the GeoIpBlocklist to use the new GeoIpDictionary as the base of its decisions. The converter is modified ot query this data to add a geoip:-keyword to documents to permit limiting a search to the country of the hosting server. The commit also adds due BY-SA attribution in the search engine footer for the source of the IP geolocation data. --- .../crawl-blocklist/build.gradle | 1 + .../ip_blocklist/GeoIpBlocklist.java | 66 ++++--------------- .../ip_blocklist/InetAddressCache.java | 2 +- code/libraries/geo-ip/build.gradle | 24 +++++++ code/libraries/geo-ip/readme.md | 6 ++ .../nu/marginalia/geoip}/GeoIpDictionary.java | 28 +++++--- .../model/processed/DomainRecord.java | 1 - .../model/processed/DomainWithIp.java | 9 +++ .../processes/converting-process/build.gradle | 1 + .../converting/processor/DomainProcessor.java | 15 ++++- ...CrawlingThenConvertingIntegrationTest.java | 3 +- .../java/nu/marginalia/crawl/CrawlerMain.java | 7 +- .../crawl/retreival/CrawlerRetreiver.java | 4 +- .../crawl/retreival/DomainProber.java | 24 ++++--- .../retreival/fetcher/HttpFetcherImpl.java | 6 +- .../retreival/CrawlerMockFetcherTest.java | 7 +- .../retreival/CrawlerRetreiverTest.java | 9 +-- .../templates/search/parts/search-footer.hdb | 8 ++- .../assistant-service/build.gradle | 1 + .../domains/DomainInformationService.java | 3 +- settings.gradle | 1 + 21 files changed, 135 insertions(+), 91 deletions(-) create mode 100644 code/libraries/geo-ip/build.gradle create mode 100644 code/libraries/geo-ip/readme.md rename code/{services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains => libraries/geo-ip/src/main/java/nu/marginalia/geoip}/GeoIpDictionary.java (80%) diff --git a/code/features-crawl/crawl-blocklist/build.gradle b/code/features-crawl/crawl-blocklist/build.gradle index c131e97b..8288aa0c 100644 --- a/code/features-crawl/crawl-blocklist/build.gradle +++ b/code/features-crawl/crawl-blocklist/build.gradle @@ -15,6 +15,7 @@ dependencies { implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:libraries:guarded-regex') + implementation project(':code:libraries:geo-ip') implementation libs.notnull diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java index ba896317..79ca6847 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java @@ -1,73 +1,31 @@ package nu.marginalia.ip_blocklist; +import com.google.inject.Inject; import com.google.inject.Singleton; -import com.opencsv.CSVReader; -import com.opencsv.exceptions.CsvValidationException; -import lombok.AllArgsConstructor; -import nu.marginalia.WmsaHome; +import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.FileReader; -import java.io.IOException; -import java.net.InetAddress; import java.util.Set; -import java.util.TreeMap; @Singleton public class GeoIpBlocklist { - private final TreeMap ranges = new TreeMap<>(); - + /** These countries are extremely overrepresented among the problematic and spammy domains, + * and blocking them is by far the most effective spam mitigation technique. Sucks we throw + * babies out with the bathwater, but it's undeniably effective. + */ private final Set blacklist = Set.of("CN", "HK"); private final Set graylist = Set.of("RU", "TW", "IN", "ZA", "SG", "UA"); private static final Logger logger = LoggerFactory.getLogger(GeoIpBlocklist.class); - @AllArgsConstructor - static class IpRange { - public final long from; - public final long to; - public final String country; - } + private final GeoIpDictionary ipDictionary; - public GeoIpBlocklist() throws IOException, CsvValidationException { - var resource = WmsaHome.getIPLocationDatabse(); - - try (var reader = new CSVReader(new FileReader(resource.toFile()))) { - for (;;) { - String[] vals = reader.readNext(); - if (vals == null) { - break; - } - if (!(blacklist.contains(vals[2]) || graylist.contains(vals[2]))) { - continue; - } - var range = new GeoIpBlocklist.IpRange(Long.parseLong(vals[0]), - Long.parseLong(vals[1]), - vals[2]); - ranges.put(range.from, range); - } - } - - logger.info("Loaded {} IP ranges", ranges.size()); - } - - public String getCountry(InetAddress address) { - byte[] bytes = address.getAddress(); - long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF); - - Long key = ranges.floorKey(ival); - if (null == key) { - return "-"; - } - - var range = ranges.get(key); - if (ival >= key && ival < range.to) { - return range.country; - } - - return "-"; + @Inject + public GeoIpBlocklist(GeoIpDictionary ipDictionary) { + this.ipDictionary = ipDictionary; + ipDictionary.waitReady(); } public boolean isAllowed(EdgeDomain domain) { @@ -85,7 +43,7 @@ public class GeoIpBlocklist { public String getCountry(EdgeDomain domain) { try { - return getCountry(InetAddressCache.getAddress(domain)); + return ipDictionary.getCountry(InetAddressCache.getAddress(domain)); } catch (Throwable ex) { logger.debug("Failed to resolve {}", domain); diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java index 728a1f65..ba9a7948 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java @@ -11,7 +11,7 @@ import java.util.concurrent.TimeUnit; // We don't want to torture the DNS by resolving the same links over and over and over again public class InetAddressCache { - private static final Cache cache = CacheBuilder.newBuilder().maximumSize(10_000_000).expireAfterAccess(1, TimeUnit.HOURS).build(); + private static final Cache cache = CacheBuilder.newBuilder().maximumSize(1_000_000).expireAfterAccess(1, TimeUnit.HOURS).build(); public static InetAddress getAddress(EdgeDomain domain) throws Throwable { try { return cache.get(domain, ()-> InetAddress.getByName(domain.getAddress())); diff --git a/code/libraries/geo-ip/build.gradle b/code/libraries/geo-ip/build.gradle new file mode 100644 index 00000000..b0180ef8 --- /dev/null +++ b/code/libraries/geo-ip/build.gradle @@ -0,0 +1,24 @@ +plugins { + id 'java' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(21)) + } +} + +dependencies { + implementation project(':code:common:config') + + implementation libs.bundles.slf4j + implementation libs.opencsv + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} diff --git a/code/libraries/geo-ip/readme.md b/code/libraries/geo-ip/readme.md new file mode 100644 index 00000000..2a81b04b --- /dev/null +++ b/code/libraries/geo-ip/readme.md @@ -0,0 +1,6 @@ +This micro library handles the GeoIP lookups, mappings from IP addresses +to country codes. + +It uses the free ip2location lite database, which is +available from [https://lite.ip2location.com/database/ip-country](https://lite.ip2location.com/database/ip-country) +under a CC-BY-SA 4.0 license. \ No newline at end of file diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/GeoIpDictionary.java b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java similarity index 80% rename from code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/GeoIpDictionary.java rename to code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java index e250761e..83789905 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/GeoIpDictionary.java +++ b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java @@ -1,7 +1,6 @@ -package nu.marginalia.assistant.domains; +package nu.marginalia.geoip; import com.opencsv.CSVReader; -import lombok.AllArgsConstructor; import nu.marginalia.WmsaHome; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -14,12 +13,7 @@ public class GeoIpDictionary { private volatile TreeMap ranges = null; private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class); - @AllArgsConstructor - static class IpRange { - public final long from; - public final long to; - public final String country; - } + record IpRange(long from, long to, String country) {} public GeoIpDictionary() { Thread.ofPlatform().start(() -> { @@ -39,10 +33,28 @@ public class GeoIpDictionary { ranges = dict; logger.info("Loaded {} IP ranges", ranges.size()); } catch (Exception e) { + ranges = new TreeMap<>(); throw new RuntimeException(e); } + finally { + this.notifyAll(); + } }); + } + public boolean isReady() { + return null != ranges; + } + + public boolean waitReady() { + while (null == ranges) { + try { + this.wait(); + } catch (InterruptedException e) { + return false; + } + } + return true; } public String getCountry(String ip) { diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java index 6b3491bf..b696829f 100644 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java @@ -8,7 +8,6 @@ import org.apache.parquet.schema.*; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; -import java.sql.Array; import java.util.ArrayList; import java.util.List; diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java index bedae4d5..3782b1b2 100644 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java @@ -1,5 +1,14 @@ package nu.marginalia.model.processed; +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import lombok.ToString; + +@AllArgsConstructor +@NoArgsConstructor +@EqualsAndHashCode +@ToString public class DomainWithIp { public String domain; public String ip; diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index faa952fb..979260df 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -41,6 +41,7 @@ dependencies { implementation project(':code:libraries:guarded-regex') implementation project(':code:libraries:easy-lsh') + implementation project(':code:libraries:geo-ip') implementation project(':code:libraries:big-string') implementation project(':code:libraries:language-processing') diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 00a05257..df682d77 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -10,6 +10,7 @@ import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.logic.links.LinkGraph; import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.*; +import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeDomain; @@ -30,6 +31,7 @@ public class DomainProcessor { private final AnchorTagsSource anchorTagsSource; private final AnchorTextKeywords anchorTextKeywords; private final LshDocumentDeduplicator documentDeduplicator; + private final GeoIpDictionary geoIpDictionary; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -38,17 +40,21 @@ public class DomainProcessor { SiteWords siteWords, AnchorTagsSourceFactory anchorTagsSourceFactory, AnchorTextKeywords anchorTextKeywords, - LshDocumentDeduplicator documentDeduplicator) throws SQLException + LshDocumentDeduplicator documentDeduplicator, GeoIpDictionary geoIpDictionary) throws SQLException { this.documentProcessor = documentProcessor; this.siteWords = siteWords; this.anchorTextKeywords = anchorTextKeywords; this.documentDeduplicator = documentDeduplicator; this.anchorTagsSource = anchorTagsSourceFactory.create(); + this.geoIpDictionary = geoIpDictionary; + } @SneakyThrows public ProcessedDomain process(SerializableCrawlDataStream dataStream) { + geoIpDictionary.waitReady(); + var ret = new ProcessedDomain(); List docs = new ArrayList<>(); @@ -107,7 +113,14 @@ public class DomainProcessor { // Add late keywords and features from domain-level information List terms = new ArrayList<>(); + terms.add("ip:"+ip); + + String geoIp = geoIpDictionary.getCountry(ip); + if (!geoIp.isBlank()) { + terms.add("geoip:"+geoIp.toLowerCase()); + } + if (cookies) { terms.add(HtmlFeature.COOKIES.getKeyword()); } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 58d8a486..7ef056d2 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -6,6 +6,7 @@ import lombok.SneakyThrows; import nu.marginalia.WmsaHome; import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawling.io.SerializableCrawlDataStream; @@ -75,7 +76,7 @@ public class CrawlingThenConvertingIntegrationTest { private CrawledDomain crawl(CrawlSpecRecord specs) { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, specs, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch(); CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get(); data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index dc76abde..f824d815 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome; import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.crawl.retreival.CrawlDataReference; +import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.spec.CrawlSpecProvider; import nu.marginalia.crawl.spec.DbCrawlSpecProvider; @@ -56,6 +57,7 @@ public class CrawlerMain { private final UserAgent userAgent; private final MessageQueueFactory messageQueueFactory; + private final DomainProber domainProber; private final FileStorageService fileStorageService; private final DbCrawlSpecProvider dbCrawlSpecProvider; private final AnchorTagsSourceFactory anchorTagsSourceFactory; @@ -75,7 +77,7 @@ public class CrawlerMain { @Inject public CrawlerMain(UserAgent userAgent, ProcessHeartbeatImpl heartbeat, - MessageQueueFactory messageQueueFactory, + MessageQueueFactory messageQueueFactory, DomainProber domainProber, FileStorageService fileStorageService, ProcessConfiguration processConfiguration, DbCrawlSpecProvider dbCrawlSpecProvider, @@ -84,6 +86,7 @@ public class CrawlerMain { this.heartbeat = heartbeat; this.userAgent = userAgent; this.messageQueueFactory = messageQueueFactory; + this.domainProber = domainProber; this.fileStorageService = fileStorageService; this.dbCrawlSpecProvider = dbCrawlSpecProvider; this.anchorTagsSourceFactory = anchorTagsSourceFactory; @@ -219,7 +222,7 @@ public class CrawlerMain { var domainLinks = anchorTagsSource.getAnchorTags(domain); - var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); + var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, writer::accept); int size = retreiver.fetch(domainLinks, reference); workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index ce5ecb89..b32e0b6c 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -42,7 +42,7 @@ public class CrawlerRetreiver { private static final UrlBlocklist urlBlocklist = new UrlBlocklist(); private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector(); - private static final DomainProber domainProber = new DomainProber(); + private final DomainProber domainProber; private final SitemapRetriever sitemapRetriever; private final DomainCrawlFrontier crawlFrontier; @@ -55,9 +55,11 @@ public class CrawlerRetreiver { private static final String documentWasSameTag = "SAME-BY-COMPARISON"; public CrawlerRetreiver(HttpFetcher fetcher, + DomainProber domainProber, CrawlSpecRecord specs, Consumer writer) { this.fetcher = fetcher; + this.domainProber = domainProber; domain = specs.domain; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java index 67f006d4..fcc005a8 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java @@ -1,5 +1,7 @@ package nu.marginalia.crawl.retreival; +import com.google.inject.Inject; +import com.google.inject.Singleton; import nu.marginalia.crawl.retreival.fetcher.FetchResultState; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawling.model.CrawlerDomainStatus; @@ -11,17 +13,21 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; +import java.util.function.Predicate; +@Singleton public class DomainProber { private final Logger logger = LoggerFactory.getLogger(DomainProber.class); - private static IpBlockList ipBlockList; + private final Predicate domainBlacklist; - static { - try { - ipBlockList = new IpBlockList(new GeoIpBlocklist()); - } catch (Exception e) { - throw new RuntimeException(e); - } + @Inject + public DomainProber(IpBlockList ipBlockList) { + this.domainBlacklist = ipBlockList::isAllowed; + } + + /** For testing */ + public DomainProber(Predicate domainBlacklist) { + this.domainBlacklist = domainBlacklist; } /** To detect problems early we do a probing request to the domain before we start crawling it properly. @@ -37,7 +43,7 @@ public class DomainProber { return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs"); } - if (!ipBlockList.isAllowed(firstUrlInQueue.domain)) + if (!domainBlacklist.test(firstUrlInQueue.domain)) return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed"); var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null)); @@ -62,7 +68,7 @@ public class DomainProber { /** This domain redirects to another domain */ public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {} - /** If the retreivala of the probed url was successful, return the url as it was fetched + /** If the retrieval of the probed url was successful, return the url as it was fetched * (which may be different from the url we probed, if we attempted another URL schema). * * @param probedUrl The url we successfully probed diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 041ae08d..5720ef34 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -15,7 +15,6 @@ import nu.marginalia.model.EdgeUrl; import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; import nu.marginalia.crawl.retreival.logic.ContentTypeParser; import okhttp3.*; -import org.apache.commons.collections4.queue.PredicatedQueue; import org.apache.commons.io.input.BOMInputStream; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; @@ -87,7 +86,10 @@ public class HttpFetcherImpl implements HttpFetcher { } @Inject - public HttpFetcherImpl(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) { + public HttpFetcherImpl(@Named("user-agent") String userAgent, + Dispatcher dispatcher, + ConnectionPool connectionPool) + { this.client = createClient(dispatcher, connectionPool); this.userAgent = userAgent; } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index c0df397f..b65e5ae6 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -3,6 +3,7 @@ package nu.marginalia.crawling.retreival; import crawlercommons.robots.SimpleRobotRules; import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.*; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; @@ -68,7 +69,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html"); registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html"); - new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add) + new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add) .fetch(); out.forEach(System.out::println); @@ -80,7 +81,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html"); - new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add) + new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add) .fetch(); out.forEach(System.out::println); @@ -94,7 +95,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html"); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html"); - new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add) + new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add) .fetch(); out.forEach(System.out::println); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 147aca68..e7742445 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawling.io.CrawledDomainReader; @@ -53,7 +54,7 @@ class CrawlerRetreiverTest { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, specs, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch(); var fetchedUrls = data.stream().filter(CrawledDocument.class::isInstance) @@ -82,7 +83,7 @@ class CrawlerRetreiverTest { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, specs, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch(); data.stream().filter(CrawledDocument.class::isInstance) .map(CrawledDocument.class::cast) @@ -118,7 +119,7 @@ class CrawlerRetreiverTest { var writer = new CrawledDomainWriter(out, specs.domain, "idid"); Map, List> data = new HashMap<>(); - new CrawlerRetreiver(httpFetcher, specs, d -> { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> { data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d); if (d instanceof CrawledDocument doc) { System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); @@ -136,7 +137,7 @@ class CrawlerRetreiverTest { CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); - new CrawlerRetreiver(httpFetcher, specs, d -> { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> { if (d instanceof CrawledDocument doc) { System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); } diff --git a/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb b/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb index f911d3db..88b6ad84 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb @@ -99,7 +99,6 @@ + diff --git a/code/services-core/assistant-service/build.gradle b/code/services-core/assistant-service/build.gradle index e2c792fb..8609903d 100644 --- a/code/services-core/assistant-service/build.gradle +++ b/code/services-core/assistant-service/build.gradle @@ -33,6 +33,7 @@ dependencies { implementation project(':code:features-search:screenshots') + implementation project(':code:libraries:geo-ip') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java index 4da309dc..690509db 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java @@ -1,9 +1,8 @@ package nu.marginalia.assistant.domains; import com.zaxxer.hikari.HikariDataSource; -import lombok.SneakyThrows; +import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.db.DbDomainQueries; import nu.marginalia.assistant.client.model.DomainInformation; import org.slf4j.Logger; diff --git a/settings.gradle b/settings.gradle index 952acd9c..342107de 100644 --- a/settings.gradle +++ b/settings.gradle @@ -12,6 +12,7 @@ include 'code:services-application:dating-service' include 'code:services-application:explorer-service' include 'code:libraries:array' +include 'code:libraries:geo-ip' include 'code:libraries:btree' include 'code:libraries:easy-lsh' include 'code:libraries:guarded-regex' From 30bc3f9281962455b373eb254db6bced5f166039 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 11 Dec 2023 13:59:23 +0100 Subject: [PATCH 14/15] (converter) Use the prefix ip: instead of geopip: for country codes This is the same as the prefix for the IP address, but I don't think that substantially matters, the as two have such different namespaces there can be no confusion. --- .../marginalia/converting/processor/DomainProcessor.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index df682d77..fc824906 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -49,12 +49,11 @@ public class DomainProcessor { this.anchorTagsSource = anchorTagsSourceFactory.create(); this.geoIpDictionary = geoIpDictionary; + geoIpDictionary.waitReady(); } @SneakyThrows public ProcessedDomain process(SerializableCrawlDataStream dataStream) { - geoIpDictionary.waitReady(); - var ret = new ProcessedDomain(); List docs = new ArrayList<>(); @@ -116,9 +115,9 @@ public class DomainProcessor { terms.add("ip:"+ip); - String geoIp = geoIpDictionary.getCountry(ip); - if (!geoIp.isBlank()) { - terms.add("geoip:"+geoIp.toLowerCase()); + String ipCountryCode = geoIpDictionary.getCountry(ip).toLowerCase(); + if (!ipCountryCode.isBlank()) { + terms.add("ip:"+ipCountryCode); } if (cookies) { From 8f0950fc44d312a58c3aca3bf21a7a9e43af4ddb Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 11 Dec 2023 14:01:39 +0100 Subject: [PATCH 15/15] (geoip) Fix incorrect synchronization. --- .../src/main/java/nu/marginalia/geoip/GeoIpDictionary.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java index 83789905..13b982f5 100644 --- a/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java +++ b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java @@ -49,7 +49,9 @@ public class GeoIpDictionary { public boolean waitReady() { while (null == ranges) { try { - this.wait(); + synchronized (this) { + this.wait(1000); + } } catch (InterruptedException e) { return false; }