Merge branch 'master' into term-positions

# Conflicts:
#	code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java
#	code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java
#	code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
#	code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
#	code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java
#	code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
#	code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
#	code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java
Viktor Lofgren 2024-09-08 10:12:53 +02:00
commit 8f367d96f8
26 changed files with 835 additions and 108 deletions

View File

@ -34,7 +34,6 @@ import org.apache.logging.log4j.util.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
@ -201,7 +200,7 @@ public class ConverterMain extends ProcessMainClass {
try {
return Optional.of(CrawledDomainReader.createDataStream(path));
}
catch (IOException ex) {
catch (Exception ex) {
return Optional.empty();
}
}

View File

@ -151,9 +151,9 @@ public class RedditSideloader implements SideloadSource {
var doc = sideloaderProcessing
.processDocument(fullUrl,
fullHtml,
List.of("encyclopedia", "wiki"),
List.of("reddit"),
domainLinks,
GeneratorType.WIKI,
GeneratorType.FORUM,
DocumentClass.SIDELOAD,
anchorTextKeywords.getAnchorTextKeywords(domainLinks, urls),
pubYear,

View File

@ -9,6 +9,9 @@ import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.CrawlerDomainStatus;
import nu.marginalia.ip_blocklist.UrlBlocklist;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
@ -28,6 +31,7 @@ import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
public class CrawlerRetreiver implements AutoCloseable {
@ -88,17 +92,8 @@ public class CrawlerRetreiver implements AutoCloseable {
}
public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
final DomainProber.ProbeResult probeResult = domainProber.probeDomain(
fetcher,
domain,
new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
try {
// Sleep a bit to avoid hammering the server with requests, we just probed it
TimeUnit.SECONDS.sleep(1);
// Fetch the domain
return crawlDomain(oldCrawlData, probeResult, domainLinks);
return crawlDomain(oldCrawlData, domainLinks);
}
catch (Exception ex) {
logger.error("Error crawling domain {}", domain, ex);
@ -112,25 +107,33 @@ public class CrawlerRetreiver implements AutoCloseable {
resync.run(warcFile);
}
private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException {
String ip = findIp(domain);
EdgeUrl rootUrl;
private DomainProber.ProbeResult probeRootUrl(String ip) throws IOException {
// Construct a URL to the root of the domain; we don't know the scheme yet, so we'll
// start with http and then try https if that fails
var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
if (!(probeResult instanceof DomainProber.ProbeResultOk ok)) {
return 1;
}
else {
rootUrl = ok.probedUrl();
}
return probeResult;
}
private int crawlDomain(CrawlDataReference oldCrawlData, DomainLinks domainLinks) throws IOException, InterruptedException {
String ip = findIp(domain);
EdgeUrl rootUrl;
if (probeRootUrl(ip) instanceof DomainProber.ProbeResultOk ok) rootUrl = ok.probedUrl();
else return 1;
// Sleep after the initial probe, we don't have access to the robots.txt yet
// so we don't know the crawl delay
TimeUnit.SECONDS.sleep(1);
final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
delayTimer.waitFetchDelay(0); // initial delay after robots.txt
sniffRootDocument(rootUrl, delayTimer);
delayTimer.waitFetchDelay(0); // delay after sniffing
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
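The revisit machinery itself is not shown in this diff. As a rough, hypothetical sketch of the mechanism the comment above describes, where the class and method names are illustrative and not the project's actual CrawlerRevisitor or ContentTags API, a recrawl request can reuse the previous crawl's validators as conditional headers so unchanged documents come back as 304 responses:
import java.net.URI;
import java.net.http.HttpRequest;
// Hypothetical sketch only; not the actual Marginalia revisit code.
final class ConditionalRecrawlSketch {
    /** Build a GET that lets the server answer 304 if the stored validators still match. */
    static HttpRequest conditionalGet(URI url, String etag, String lastModified) {
        var builder = HttpRequest.newBuilder(url).GET();
        if (etag != null && !etag.isBlank())
            builder.header("If-None-Match", etag);            // compare against the stored ETag
        if (lastModified != null && !lastModified.isBlank())
            builder.header("If-Modified-Since", lastModified); // compare against the stored date
        return builder.build();
    }
}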
@ -188,7 +191,7 @@ public class CrawlerRetreiver implements AutoCloseable {
try {
if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) {
if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) {
fetchedCount++;
}
}
@ -209,21 +212,8 @@ public class CrawlerRetreiver implements AutoCloseable {
var url = rootUrl.withPathAndParam("/", null);
HttpFetchResult result = null;
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
try {
result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
break;
}
catch (RateLimitException ex) {
timer.waitRetryDelay(ex);
}
catch (Exception ex) {
logger.warn("Failed to fetch {}", url, ex);
result = new HttpFetchResult.ResultException(ex);
}
}
HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
timer.waitFetchDelay(0);
if (!(result instanceof HttpFetchResult.ResultOk ok))
return;
@ -236,24 +226,40 @@ public class CrawlerRetreiver implements AutoCloseable {
var doc = optDoc.get();
crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
Optional<EdgeUrl> sitemapUrl = Optional.empty();
for (var link : doc.getElementsByTag("link")) {
String rel = link.attr("rel");
String type = link.attr("type");
if (!rel.equalsIgnoreCase("alternate"))
continue;
if (rel.equals("icon") || rel.equals("shortcut icon")) {
String href = link.attr("href");
if (!(type.equalsIgnoreCase("application/atom+xml")
|| type.equalsIgnoreCase("application/rss+xml")))
continue;
faviconUrl = linkParser.parseLink(url, href)
.filter(crawlFrontier::isSameDomain)
.orElse(faviconUrl);
}
String href = link.attr("href");
// Grab the RSS/Atom as a sitemap if it exists
if (rel.equalsIgnoreCase("alternate")
&& (type.equalsIgnoreCase("application/atom+xml") || type.equalsIgnoreCase("application/atomsvc+xml"))) {
String href = link.attr("href");
linkParser.parseLink(url, href)
.filter(crawlFrontier::isSameDomain)
.map(List::of)
.ifPresent(sitemapFetcher::downloadSitemaps);
sitemapUrl = linkParser.parseLink(url, href)
.filter(crawlFrontier::isSameDomain);
}
}
// Download the sitemap if one exists
if (sitemapUrl.isPresent()) {
sitemapFetcher.downloadSitemaps(List.of(sitemapUrl.get()));
timer.waitFetchDelay(0);
}
// Grab the favicon if it exists
fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
timer.waitFetchDelay(0);
}
catch (Exception ex) {
logger.error("Error configuring link filter", ex);
@ -263,31 +269,16 @@ public class CrawlerRetreiver implements AutoCloseable {
}
}
public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
CrawlDelayTimer timer,
DocumentWithReference reference) throws InterruptedException
public HttpFetchResult fetchContentWithReference(EdgeUrl top,
CrawlDelayTimer timer,
DocumentWithReference reference) throws InterruptedException
{
logger.debug("Fetching {}", top);
HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone();
long startTime = System.currentTimeMillis();
var contentTags = reference.getContentTags();
// Fetch the document, retrying if we get a rate limit exception
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
try {
fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags);
break;
}
catch (RateLimitException ex) {
timer.waitRetryDelay(ex);
}
catch (Exception ex) {
logger.warn("Failed to fetch {}", top, ex);
fetchedDoc = new HttpFetchResult.ResultException(ex);
}
}
HttpFetchResult fetchedDoc = fetchWithRetry(top, timer, HttpFetcher.ProbeType.FULL, contentTags);
// Parse the document and enqueue links
try {
@ -329,6 +320,27 @@ public class CrawlerRetreiver implements AutoCloseable {
return fetchedDoc;
}
/** Fetch a document and retry on 429s */
private HttpFetchResult fetchWithRetry(EdgeUrl url,
CrawlDelayTimer timer,
HttpFetcher.ProbeType probeType,
ContentTags contentTags) throws InterruptedException {
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
try {
return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
}
catch (RateLimitException ex) {
timer.waitRetryDelay(ex);
}
catch (Exception ex) {
logger.warn("Failed to fetch {}", url, ex);
return new HttpFetchResult.ResultException(ex);
}
}
return new HttpFetchResult.ResultNone();
}
private boolean isAllowedProtocol(String proto) {
return proto.equalsIgnoreCase("http")
|| proto.equalsIgnoreCase("https");

View File

@ -4,6 +4,7 @@ import com.google.inject.ImplementedBy;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
@ -19,9 +20,18 @@ public interface HttpFetcher {
FetchResult probeDomain(EdgeUrl url);
HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) throws RateLimitException;
HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder recorder,
ContentTags tags,
ProbeType probeType) throws RateLimitException;
SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);
SitemapRetriever createSitemapRetriever();
enum ProbeType {
DISABLED,
FULL,
IF_MODIFIED_SINCE
}
}

View File

@ -12,6 +12,9 @@ import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.body.ContentTypeLogic;
import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
@ -145,12 +148,13 @@ public class HttpFetcherImpl implements HttpFetcher {
@SneakyThrows
public HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder warcRecorder,
ContentTags contentTags)
ContentTags contentTags,
ProbeType probeType)
{
// We don't want to waste time and resources on URLs that are not HTML, so if the file ending
// looks like it might be something else, we perform a HEAD first to check the content type
if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
if (probeType == ProbeType.FULL && contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
{
ContentTypeProbeResult probeResult = contentTypeProber.probeContentType(url);
if (probeResult instanceof ContentTypeProbeResult.Ok ok) {
@ -174,7 +178,9 @@ public class HttpFetcherImpl implements HttpFetcher {
else {
// Possibly do a soft probe to see if the URL has been modified since the last time we crawled it
// if we have reason to suspect ETags are not supported by the server.
if (softIfModifiedSinceProber.probeModificationTime(url, contentTags)) {
if (probeType == ProbeType.IF_MODIFIED_SINCE
&& softIfModifiedSinceProber.probeModificationTime(url, contentTags))
{
return new HttpFetchResult.Result304Raw();
}
}
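The soft probe referenced in the comment above is not included in this diff. As a hedged sketch of the general idea, with names assumed for illustration rather than taken from the actual SoftIfModifiedSinceProber, a soft modification-time probe can send a HEAD request with If-Modified-Since and treat a 304 as "unchanged":
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
// Hypothetical sketch only; not the project's SoftIfModifiedSinceProber.
final class SoftIfModifiedSinceSketch {
    private final HttpClient client = HttpClient.newHttpClient();
    /** Returns true if the server reports the URL as unmodified since lastModified. */
    boolean probeModificationTime(URI url, String lastModified) throws Exception {
        if (lastModified == null || lastModified.isBlank())
            return false; // nothing to compare against; fall back to a full fetch
        var request = HttpRequest.newBuilder(url)
                .header("If-Modified-Since", lastModified)
                .method("HEAD", HttpRequest.BodyPublishers.noBody())
                .build();
        var response = client.send(request, HttpResponse.BodyHandlers.discarding());
        return response.statusCode() == 304;
    }
}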

View File

@ -137,7 +137,7 @@ public class CrawlerRevisitor {
DocumentWithReference reference = new DocumentWithReference(doc, oldCrawlData);
var result = crawlerRetreiver.fetchWriteAndSleep(url, delayTimer, reference);
var result = crawlerRetreiver.fetchContentWithReference(url, delayTimer, reference);
if (reference.isSame(result)) {
retained++;

View File

@ -46,22 +46,35 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
blacklist.waitUntilLoaded();
List<Integer> domainIds = new ArrayList<>(10_000);
try (var conn = dataSource.getConnection();
var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0");
var query = conn.prepareStatement("""
SELECT DOMAIN_NAME, COALESCE(KNOWN_URLS, 0), EC_DOMAIN.ID
FROM EC_DOMAIN
LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE NODE_AFFINITY=?
"""))
WHERE NODE_AFFINITY=? OR NODE_AFFINITY=0
""")
)
{
// Assign any domains with node_affinity=0 to this node. We must do this now, before we start crawling
// to avoid race conditions with other crawl runs. We don't want multiple crawlers to crawl the same domain.
assignFreeDomains.setInt(1, processConfiguration.node());
assignFreeDomains.executeUpdate();
// Fetch the domains to be crawled
query.setInt(1, processConfiguration.node());
query.setFetchSize(10_000);
var rs = query.executeQuery();
while (rs.next()) {
// Skip blacklisted domains
if (blacklist.isBlacklisted(rs.getInt(3)))
int id = rs.getInt(3);
if (blacklist.isBlacklisted(id))
continue;
domainIds.add(id);
int urls = rs.getInt(2);
double growthFactor;
@ -83,6 +96,7 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
domains.add(record);
}
}
logger.info("Loaded {} domains", domains.size());

View File

@ -1,6 +1,9 @@
package nu.marginalia.io.crawldata;
import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileNotFoundException;
import java.io.IOException;
@ -8,16 +11,23 @@ import java.nio.file.Files;
import java.nio.file.Path;
public class CrawledDomainReader {
private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class);
/** Iterator-like access to domain data. This must be closed, otherwise it will leak off-heap memory! */
public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException
{
String fileName = fullPath.getFileName().toString();
if (fileName.endsWith(".parquet")) {
return new ParquetSerializableCrawlDataStream(fullPath);
}
else {
throw new IllegalArgumentException("Unknown file type: " + fullPath);
try {
return new ParquetSerializableCrawlDataStream(fullPath);
} catch (Exception ex) {
logger.error("Error reading domain data from " + fullPath, ex);
return SerializableCrawlDataStream.empty();
}
} else {
logger.error("Unknown file type: {}", fullPath);
return SerializableCrawlDataStream.empty();
}
}

View File

@ -12,13 +12,10 @@
# Cloud Yuqu LLC
172.247.0.0/16
107.151.64.0/18
# Google Cloud
# 35.208.0.0/12
# 35.224.0.0/12
# 35.240.0.0/13
# 1Blu
178.254.10.0/23
178.254.10.0/23
# Domain parking spam
199.59.243.0/24

View File

@ -3,12 +3,13 @@ package nu.marginalia.crawling;
import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.body.ContentTypeLogic;
import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.DocumentBodyResult;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.DocumentBodyResult;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
@ -35,7 +36,7 @@ class HttpFetcherTest {
void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
try (var recorder = new WarcRecorder()) {
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty());
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
System.out.println(bodyOk.contentType());
}
@ -47,7 +48,7 @@ class HttpFetcherTest {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
try (var recorder = new WarcRecorder()) {
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty());
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
System.out.println(bodyOk.contentType());
}

View File

@ -122,7 +122,7 @@ public class CrawlerMockFetcherTest {
@SneakyThrows
@Override
public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) {
public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags, ProbeType probeType) {
logger.info("Fetching {}", url);
if (mockData.containsKey(url)) {
byte[] bodyBytes = mockData.get(url).documentBody.getBytes();

View File

@ -261,6 +261,7 @@ class CrawlerRetreiverTest {
.collect(Collectors.toSet());
assertEquals(Set.of("https://www.marginalia.nu/",
"https://www.marginalia.nu/favicon.ico",
"https://www.marginalia.nu/log/06-optimization.gmi/"),
fetchedUrls);

View File

@ -12,6 +12,7 @@ import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.model.ClusteredUrlDetails;
@ -162,7 +163,7 @@ public class SearchOperator {
return new UrlDetails(
item.documentId(),
item.domainId(),
item.url,
cleanUrl(item.url),
item.title,
item.description,
item.format,
@ -177,6 +178,31 @@ public class SearchOperator {
);
}
/** Replace nuisance domains with replacements where available */
private static EdgeUrl cleanUrl(EdgeUrl url) {
String topdomain = url.domain.topDomain;
String subdomain = url.domain.subDomain;
String path = url.path;
if (topdomain.equals("fandom.com")) {
int wikiIndex = path.indexOf("/wiki/");
if (wikiIndex >= 0) {
return new EdgeUrl("https", new EdgeDomain("breezewiki.com"), null, "/" + subdomain + path.substring(wikiIndex), null);
}
}
else if (topdomain.equals("medium.com")) {
if (!subdomain.isBlank()) {
return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path, null);
}
else {
String article = path.substring(path.indexOf("/", 1));
return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, article, null);
}
}
return url;
}
@SneakyThrows
private List<String> getProblems(String evalResult, List<UrlDetails> queryResults, QueryResponse response) {

View File

@ -54,6 +54,7 @@ dependencies {
implementation libs.handlebars
implementation libs.duckdb
implementation libs.jsoup
implementation libs.trove
implementation dependencies.create(libs.spark.get()) {

View File

@ -2,16 +2,18 @@ package nu.marginalia.control;
import com.google.gson.Gson;
import com.google.inject.Inject;
import nu.marginalia.service.ServiceMonitors;
import nu.marginalia.control.actor.ControlActorService;
import nu.marginalia.control.app.svc.*;
import nu.marginalia.control.node.svc.ControlNodeActionsService;
import nu.marginalia.control.node.svc.ControlFileStorageService;
import nu.marginalia.control.node.svc.ControlNodeActionsService;
import nu.marginalia.control.node.svc.ControlNodeService;
import nu.marginalia.control.sys.svc.*;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.service.server.*;
import nu.marginalia.service.ServiceMonitors;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.Service;
import nu.marginalia.service.server.StaticResources;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
@ -19,7 +21,7 @@ import spark.Response;
import spark.Spark;
import java.io.IOException;
import java.util.*;
import java.util.Map;
public class ControlService extends Service {
@ -56,6 +58,7 @@ public class ControlService extends Service {
ControlDomainRankingSetsService controlDomainRankingSetsService,
ControlActorService controlActorService,
AbortedProcessService abortedProcessService,
DomainsManagementService domainsManagementService,
ControlErrorHandler errorHandler
) throws IOException {
@ -84,6 +87,7 @@ public class ControlService extends Service {
apiKeyService.register();
domainComplaintService.register();
randomExplorationService.register();
domainsManagementService.register();
errorHandler.register();

View File

@ -0,0 +1,40 @@
package nu.marginalia.control.app.model;
public record DomainModel(int id,
String name,
String ip,
int nodeAffinity,
double rank,
boolean blacklisted) {
public boolean isUnassigned() {
return nodeAffinity < 0;
}
public DomainAffinityState getAffinityState() {
if (nodeAffinity < 0) {
return DomainAffinityState.Known;
}
else if (nodeAffinity == 0) {
return DomainAffinityState.Scheduled;
}
else {
return DomainAffinityState.Assigned;
}
}
public enum DomainAffinityState {
Assigned("The domain has been assigned to a node."),
Scheduled("The domain will be assigned to the next crawling node."),
Known("The domain is known but not yet scheduled for crawling.");
private final String desc;
DomainAffinityState(String desc) {
this.desc = desc;
}
public String getDesc() {
return desc;
}
}
}

View File

@ -0,0 +1,26 @@
package nu.marginalia.control.app.model;
import java.util.List;
import java.util.Map;
public record DomainSearchResultModel(String query,
String affinity,
String field,
Map<String, Boolean> selectedAffinity,
Map<String, Boolean> selectedField,
int page,
boolean hasNext,
boolean hasPrevious,
List<Integer> nodes,
List<DomainModel> results)
{
public Integer getNextPage() {
if (!hasNext) return null;
return page + 1;
}
public Integer getPreviousPage() {
if (!hasPrevious) return null;
return page - 1;
}
}

View File

@ -0,0 +1,310 @@
package nu.marginalia.control.app.svc;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.control.ControlRendererFactory;
import nu.marginalia.control.Redirects;
import nu.marginalia.control.app.model.DomainModel;
import nu.marginalia.control.app.model.DomainSearchResultModel;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.nodecfg.NodeConfigurationService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.sql.SQLException;
import java.util.*;
public class DomainsManagementService {
private final HikariDataSource dataSource;
private final NodeConfigurationService nodeConfigurationService;
private final ControlRendererFactory rendererFactory;
@Inject
public DomainsManagementService(HikariDataSource dataSource,
NodeConfigurationService nodeConfigurationService,
ControlRendererFactory rendererFactory
) {
this.dataSource = dataSource;
this.nodeConfigurationService = nodeConfigurationService;
this.rendererFactory = rendererFactory;
}
public void register() throws IOException {
var domainsViewRenderer = rendererFactory.renderer("control/app/domains");
var addDomainsTxtViewRenderer = rendererFactory.renderer("control/app/domains-new");
var addDomainsUrlViewRenderer = rendererFactory.renderer("control/app/domains-new-url");
var addDomainsAfterReportRenderer = rendererFactory.renderer("control/app/domains-new-report");
Spark.get("/domain", this::getDomains, domainsViewRenderer::render);
Spark.get("/domain/new", this::addDomainsTextfield, addDomainsTxtViewRenderer::render);
Spark.post("/domain/new", this::addDomainsTextfield, addDomainsAfterReportRenderer::render);
Spark.get("/domain/new-url", this::addDomainsFromDownload, addDomainsUrlViewRenderer::render);
Spark.post("/domain/new-url", this::addDomainsFromDownload, addDomainsAfterReportRenderer::render);
Spark.post("/domain/:id/assign/:node", this::assignDomain, new Redirects.HtmlRedirect("/domain"));
}
private Object addDomainsTextfield(Request request, Response response) throws SQLException {
if ("GET".equals(request.requestMethod())) {
return "";
}
else if ("POST".equals(request.requestMethod())) {
String nodeStr = request.queryParams("node");
String domainsStr = request.queryParams("domains");
int node = Integer.parseInt(nodeStr);
List<EdgeDomain> validDomains;
List<String> invalidDomains;
Map.Entry<List<EdgeDomain>, List<String>> domainsList = parseDomainsList(domainsStr);
validDomains = domainsList.getKey();
invalidDomains = domainsList.getValue();
insertDomains(validDomains, node);
return Map.of("validDomains", validDomains,
"invalidDomains", invalidDomains);
}
return "";
}
private Map.Entry<List<EdgeDomain>, List<String>> parseDomainsList(String domainsStr) {
List<EdgeDomain> validDomains = new ArrayList<>();
List<String> invalidDomains = new ArrayList<>();
for (String domain : domainsStr.split("\n+")) {
domain = domain.trim();
if (domain.isBlank()) continue;
if (domain.length() > 255) {
invalidDomains.add(domain);
continue;
}
if (domain.startsWith("#")) {
continue;
}
// Run through the URI parser to check for bad domains
try {
if (domain.contains(":")) {
domain = new URI(domain ).toURL().getHost();
}
else {
domain = new URI("https://" + domain + "/").toURL().getHost();
}
} catch (URISyntaxException | MalformedURLException e) {
invalidDomains.add(domain);
continue;
}
validDomains.add(new EdgeDomain(domain));
}
return Map.entry(validDomains, invalidDomains);
}
private Object addDomainsFromDownload(Request request, Response response) throws SQLException, URISyntaxException, IOException, InterruptedException {
if ("GET".equals(request.requestMethod())) {
return "";
}
else if ("POST".equals(request.requestMethod())) {
String nodeStr = request.queryParams("node");
URI domainsUrl = new URI(request.queryParams("url"));
int node = Integer.parseInt(nodeStr);
HttpClient client = HttpClient.newBuilder().build();
var httpReq = HttpRequest.newBuilder(domainsUrl).GET().build();
HttpResponse<String> result = client.send(httpReq, HttpResponse.BodyHandlers.ofString());
if (result.statusCode() != 200) {
return Map.of("error", "Failed to download domains");
}
Optional<String> ct = result.headers().firstValue("Content-Type");
if (ct.isEmpty()) {
return Map.of("error", "No content type");
}
List<EdgeDomain> validDomains = new ArrayList<>();
List<String> invalidDomains = new ArrayList<>();
String contentType = ct.get().toLowerCase();
if (contentType.startsWith("text/plain")) {
var parsedDomains = parseDomainsList(result.body());
validDomains = parsedDomains.getKey();
invalidDomains = parsedDomains.getValue();
}
else {
for (Element e : Jsoup.parse(result.body()).select("a")) {
String s = e.attr("href");
if (s.isBlank()) continue;
if (!s.contains("://")) continue;
URI uri = URI.create(s);
String scheme = uri.getScheme();
String host = uri.getHost();
if (scheme == null || host == null)
continue;
if (!scheme.equalsIgnoreCase("http") && !scheme.equalsIgnoreCase("https"))
continue;
validDomains.add(new EdgeDomain(host));
}
}
insertDomains(validDomains, node);
return Map.of("validDomains", validDomains,
"invalidDomains", invalidDomains);
}
return "";
}
private void insertDomains(List<EdgeDomain> domains, int node) throws SQLException {
// Insert the domains into the database, updating the node affinity if the domain already exists and the affinity is not already set to a node
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY)
VALUES (?, ?, ?)
ON DUPLICATE KEY UPDATE NODE_AFFINITY = IF(NODE_AFFINITY<=0, VALUES(NODE_AFFINITY), NODE_AFFINITY)
"""))
{
for (var domain : domains) {
stmt.setString(1, domain.toString());
stmt.setString(2, domain.getTopDomain());
stmt.setInt(3, node);
stmt.addBatch();
}
stmt.executeBatch();
}
}
private Object assignDomain(Request request, Response response) throws SQLException {
String idStr = request.params(":id");
String nodeStr = request.params(":node");
int id = Integer.parseInt(idStr);
int node = Integer.parseInt(nodeStr);
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY = ? WHERE ID = ?"))
{
stmt.setInt(1, node);
stmt.setInt(2, id);
stmt.executeUpdate();
}
return "";
}
private DomainSearchResultModel getDomains(Request request, Response response) throws SQLException {
List<DomainModel> ret = new ArrayList<>();
String filterRaw = Objects.requireNonNullElse(request.queryParams("filter"), "*");
String filter;
if (filterRaw.isBlank()) filter = "%";
else filter = filterRaw.replace('*', '%');
int page = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("page"), "0"));
boolean hasMore = false;
int count = 10;
String field = Objects.requireNonNullElse(request.queryParams("field"), "domain");
Map<String, Boolean> selectedField = Map.of(field, true);
String affinity = Objects.requireNonNullElse(request.queryParams("affinity"), "all");
Map<String, Boolean> selectedAffinity = Map.of(affinity, true);
StringJoiner queryJoiner = new StringJoiner(" ");
queryJoiner.add("""
SELECT EC_DOMAIN.ID,
DOMAIN_NAME,
NODE_AFFINITY,
`RANK`,
IP,
EC_DOMAIN_BLACKLIST.URL_DOMAIN IS NOT NULL AS BLACKLISTED
FROM WMSA_prod.EC_DOMAIN
LEFT JOIN WMSA_prod.EC_DOMAIN_BLACKLIST ON DOMAIN_NAME = EC_DOMAIN_BLACKLIST.URL_DOMAIN
""")
.add((switch (field) {
case "domain" -> "WHERE DOMAIN_NAME LIKE ?";
case "ip" -> "WHERE IP LIKE ?";
case "id" -> "WHERE EC_DOMAIN.ID = ?";
default -> "WHERE DOMAIN_NAME LIKE ?";
}))
.add((switch (affinity) {
case "assigned" -> "AND NODE_AFFINITY > 0";
case "scheduled" -> "AND NODE_AFFINITY = 0";
case "unassigned" -> "AND NODE_AFFINITY < 0";
default -> "";
}))
.add("LIMIT ?")
.add("OFFSET ?");
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement(queryJoiner.toString()))
{
stmt.setString(1, filter);
stmt.setInt(2, count + 1);
stmt.setInt(3, count * page);
try (var rs = stmt.executeQuery()) {
while (rs.next()) {
if (ret.size() == count) {
hasMore = true;
break;
}
ret.add(new DomainModel(
rs.getInt("ID"),
rs.getString("DOMAIN_NAME"),
rs.getString("IP"),
rs.getInt("NODE_AFFINITY"),
Math.round(100 * rs.getDouble("RANK"))/100.,
rs.getBoolean("BLACKLISTED")
));
}
}
}
List<Integer> nodes = new ArrayList<>();
for (var node : nodeConfigurationService.getAll()) {
nodes.add(node.node());
}
return new DomainSearchResultModel(filterRaw,
affinity,
field,
selectedAffinity,
selectedField,
page,
hasMore,
page > 0,
nodes,
ret);
}
}

View File

@ -0,0 +1,41 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Add Domains Report</h1>
<p></p>
{{#if error}}
<p class="alert alert-danger">{{error}}</p>
{{/if}}
{{#unless error}}
{{#unless invalidDomains}}
<p>All domains were added successfully!</p>
{{/unless}}
{{/unless}}
{{#if invalidDomains}}
<p>Some domains were invalid and could not be added:</p>
<textarea class="form-control" rows="10" disabled>
{{#each invalidDomains}}
{{.}}
{{/each}}
</textarea>
{{/if}}
{{#if validDomains}}
<p>If they were not already in the database, these domains were added:</p>
<textarea class="form-control" rows="10" disabled>
{{#each validDomains}}
{{.}}
{{/each}}
</textarea>
{{/if}}
<p></p>
</div>
</body>
{{> control/partials/foot-includes }}
</html>

View File

@ -0,0 +1,48 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Add Domains (URL)</h1>
<div class="my-3 p-3 border bg-light">
<p>This utility lets you add domains to be crawled via an external URL.</p>
<a href="/domain/new">It's also possible to add domains directly via a text area</a>
</div>
<form method="post">
<div class="form-group my-3">
<label for="url" class="form-label">Domains to add</label>
<input type="text" class="form-control" name="url"/>
<span class="text-muted">
Enter the URL to the file or page that contains the domains to add. If the URL leads to a text file,
the domains will be parsed from the file, one per line. If it leads to an HTML page, the HTML
will be parsed and all the links will be extracted and added as domains.
</span>
</div>
<div class="form-group my-3">
<label for="node" class="form-label">Node</label>
<select name="node" class="form-select">
<option value="-1">Unassigned</option>
<option value="0" selected>Auto</option>
{{#each global-context.nodes}}
<option value="{{id}}">Node {{id}}</option>
{{/each}}
</select>
<span class="text-muted">
Select the node to assign the domains to; this is the index node that will "own" the domain, crawl its documents,
and index them. If you select "Auto", the system will assign the domains to the next node that performs a crawl.
</span>
</div>
<button type="submit" class="btn btn-primary">Add</button>
</form>
</div>
</body>
{{> control/partials/foot-includes }}
</html>

View File

@ -0,0 +1,47 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Add Domains</h1>
<div class="my-3 p-3 border bg-light">
<p>This utility lets you add domains to be crawled via a text area.</p>
<a href="/domain/new-url">It's also possible to add domains via an external URL</a>
</div>
<form method="post">
<div class="form-group my-3">
<label for="domains" class="form-label">Domains to add</label>
<textarea name="domains" class="form-control" rows="10"></textarea>
<span class="text-muted">
Enter a list of domains to add, one per line. The system will check if the domain is already in the database and
will not add duplicates. Spaces and empty lines are ignored.
</span>
</div>
<div class="form-group my-3">
<label for="node" class="form-label">Node</label>
<select name="node" class="form-select">
<option value="-1">Unassigned</option>
<option value="0" selected>Auto</option>
{{#each global-context.nodes}}
<option value="{{id}}">Node {{id}}</option>
{{/each}}
</select>
<span class="text-muted">
Select the node to assign the domains to; this is the index node that will "own" the domain, crawl its documents,
and index them. If you select "Auto", the system will assign the domains to the next node that performs a crawl.
</span>
</div>
<button type="submit" class="btn btn-primary">Add</button>
</form>
</div>
</body>
{{> control/partials/foot-includes }}
</html>

View File

@ -0,0 +1,109 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Domains</h1>
<table class="table">
<form method="get">
<tr>
<td>
<select name="field" class="form-select" aria-label="Select Field">
<option value="domain" {{#if selectedField.domain}}selected{{/if}}>Domain Name</option>
<option value="id" {{#if selectedField.id}}selected{{/if}}>Domain ID</option>
<option value="ip" {{#if selectedField.ip}}selected{{/if}}>IP</option>
</select>
</td>
<td colspan="3"><input type="text" name="filter" class="form-control" placeholder="Domain" value="{{query}}"></td>
<td>
<select name="affinity" class="form-select" aria-label="Select Node Affinity">
<option value="all" {{#if selectedAffinity.all}}selected{{/if}}>-</option>
<option value="unassigned" {{#if selectedAffinity.unassigned}}selected{{/if}}>Unassigned</option>
<option value="scheduled" {{#if selectedAffinity.scheduled}}selected{{/if}}>Scheduled</option>
<option value="assigned" {{#if selectedAffinity.assigned}}selected{{/if}}>Assigned</option>
</select>
</td>
<td><button type="submit" class="btn btn-primary">Search</button></td>
</tr>
</form>
<tr>
<th>Domain</th>
<th>ID</th>
<th title="Which, if any, index node owns a domain and will crawl and index it">Node Affinity</th>
<th>Rank</th>
<th>IP</th>
<th>Blacklisted</th>
</tr>
{{#each results}}
<tr>
<td>{{name}}</td>
<td>{{id}}</td>
<td title="{{affinityState.desc}}">{{#unless unassigned}}{{affinityState}} {{#if nodeAffinity}}{{nodeAffinity}}{{/if}} {{/unless}}
{{#if unassigned}}
<div class="dropdown">
<button title="Assign to a node" class="btn btn-secondary dropdown-toggle" type="button" id="dropdownMenuButton1" data-bs-toggle="dropdown" aria-expanded="false">
Unassigned
</button>
<ul class="dropdown-menu" aria-labelledby="dropdownMenuButton1">
<form method="post">
<input type="hidden" name="node" value="0">
<li>
<button
class="dropdown-item"
title="Assign to the next node that performs a crawl"
formaction="/domain/{{id}}/assign/0"
type="submit">
Any
</button>
</li>
{{#each nodes}}
<input type="hidden" name="node" value="{{.}}">
<li>
<button
class="dropdown-item"
title="Assign to node {{.}}"
formaction="/domain/{{id}}/assign/{{.}}"
type="submit">
Node {{.}}
</button>
</li>
{{/each}}
</form>
</ul>
</div>
{{/if}}
</td>
<td>{{rank}}</td>
<td>{{ip}}</td>
<td>{{#if blacklisted}}&check;{{/if}}</td>
</tr>
{{/each}}
{{#unless results}}
<tr>
<td colspan="5">No results found</td>
</tr>
{{/unless}}
<tr>
<td>
{{#if hasPrevious}}
<a href="?page={{previousPage}}&filter={{query}}&field={{field}}&affinity={{affinity}}">Previous</a>
{{/if}}
</td>
<td colspan="4"></td>
<td>
{{#if hasNext}}
<a href="?page={{nextPage}}&filter={{query}}&field={{field}}&affinity={{affinity}}">Next</a>
{{/if}}
</td>
</tr>
</table>
</div>
</body>
{{> control/partials/foot-includes }}
</html>

View File

@ -1,5 +1,4 @@
<script src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.8/dist/umd/popper.min.js" integrity="sha384-I7E8VVD/ismYTF4hNIPjVp/Zjvgyol6VFvRkX/vR+Vc4jQkC+hVqc2pM8ODewa9r" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.min.js" integrity="sha384-BBtl+eGJRgqQAUMxJ7pMwbEyER4l1g+O15P+16Ep7Q9Q+zqX6gSbd85u4mG4QzX+" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/js/bootstrap.bundle.min.js" integrity="sha384-MrcW6ZMFYlzcLA8Nl+NtUVF0sA7MsXsP1UyJoMp4YLEuNSfAP+JcXn/tWtIaxVXM" crossorigin="anonymous"></script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<script src="/refresh.js"></script>
<script type="javascript">

View File

@ -16,13 +16,21 @@
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Application</a>
<ul class="dropdown-menu">
<li><a class="dropdown-item" href="/api-keys" title="Create or remove API keys">API Keys</a></li>
<li><a class="dropdown-item" href="/blacklist" title="Add or remove website sanctions">Blacklist</a></li>
<li><a class="dropdown-item" href="/search-to-ban" title="Search function for easy blacklisting">Blacklist Search</a></li>
<li><a class="dropdown-item" href="/complaints" title="View and act on user complaints">Complaints</a></li>
<li><a class="dropdown-item" href="/review-random-domains" title="Review random domains list">Random Exploration</a></li>
</ul>
</li>
{{/unless}}
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Domains</a>
<ul class="dropdown-menu">
<li><a class="dropdown-item" href="/domain/new" title="Add New Domains">Add Domains</a></li>
<li><a class="dropdown-item" href="/domain" title="List Domains">Manage Domains</a></li>
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="/blacklist" title="Add or remove website sanctions">Blacklist</a></li>
<li><a class="dropdown-item" href="/search-to-ban" title="Search function for easy blacklisting">Blacklist Search</a></li>
</ul>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Index Nodes</a>
<ul class="dropdown-menu">

View File

@ -16,6 +16,7 @@ import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.time.Duration;
import java.util.ArrayList;
@ -34,7 +35,7 @@ public class ScreenshotCaptureToolMain {
System.setProperty(ChromeDriverService.CHROME_DRIVER_SILENT_OUTPUT_PROPERTY, "true");
List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 1000);
List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 10_000);
HttpClient httpClient = HttpClient.newBuilder()
.version(HttpClient.Version.HTTP_1_1)
@ -137,16 +138,33 @@ public class ScreenshotCaptureToolMain {
List<EdgeDomain> ret = new ArrayList<>(queueSize);
try (var conn = ds.getConnection(); var stmt = conn.createStatement()) {
var rsp = stmt.executeQuery(
int newCount = queueSize / 4;
int oldCount = queueSize - newCount;
ResultSet rst = stmt.executeQuery(
"""
SELECT EC_DOMAIN.DOMAIN_NAME FROM EC_DOMAIN
LEFT JOIN DATA_DOMAIN_HISTORY ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_HISTORY.DOMAIN_NAME
ORDER BY SCREENSHOT_DATE IS NULL DESC, SCREENSHOT_DATE, INDEXED DESC
LIMIT
""" + queueSize);
while (rsp.next()) {
ret.add(new EdgeDomain(rsp.getString(1)));
""" + newCount);
while (rst.next()) {
ret.add(new EdgeDomain(rst.getString(1)));
}
rst = stmt.executeQuery("""
SELECT DATA_DOMAIN_HISTORY.DOMAIN_NAME FROM DATA_DOMAIN_HISTORY
INNER JOIN DATA_DOMAIN_SCREENSHOT ON DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME = DATA_DOMAIN_HISTORY.DOMAIN_NAME
WHERE SCREENSHOT_DATE IS NOT NULL
ORDER BY SCREENSHOT_DATE ASC
LIMIT
""" + oldCount);
while (rst.next()) {
ret.add(new EdgeDomain(rst.getString(1)));
}
}
catch (Exception ex) {
logger.warn("Exception in fetching queue", ex);