diff --git a/code/execution/api/java/nu/marginalia/executor/client/ExecutorCrawlClient.java b/code/execution/api/java/nu/marginalia/executor/client/ExecutorCrawlClient.java index b037702d..25610892 100644 --- a/code/execution/api/java/nu/marginalia/executor/client/ExecutorCrawlClient.java +++ b/code/execution/api/java/nu/marginalia/executor/client/ExecutorCrawlClient.java @@ -44,6 +44,15 @@ public class ExecutorCrawlClient { .build()); } + public void triggerRecrawlSingleDomain(int node, FileStorageId fid, String domainName) { + channelPool.call(ExecutorCrawlApiBlockingStub::triggerSingleDomainRecrawl) + .forNode(node) + .run(RpcFileStorageIdWithDomainName.newBuilder() + .setFileStorageId(fid.id()) + .setTargetDomainName(domainName) + .build()); + } + public void triggerConvert(int node, FileStorageId fid) { channelPool.call(ExecutorCrawlApiBlockingStub::triggerConvert) .forNode(node) diff --git a/code/execution/api/src/main/protobuf/executor-api.proto b/code/execution/api/src/main/protobuf/executor-api.proto index 565770ac..2858d60b 100644 --- a/code/execution/api/src/main/protobuf/executor-api.proto +++ b/code/execution/api/src/main/protobuf/executor-api.proto @@ -22,6 +22,7 @@ service ExecutorApi { service ExecutorCrawlApi { rpc triggerCrawl(RpcFileStorageId) returns (Empty) {} rpc triggerRecrawl(RpcFileStorageId) returns (Empty) {} + rpc triggerSingleDomainRecrawl(RpcFileStorageIdWithDomainName) returns (Empty) {} rpc triggerConvert(RpcFileStorageId) returns (Empty) {} rpc triggerConvertAndLoad(RpcFileStorageId) returns (Empty) {} rpc loadProcessedData(RpcFileStorageIds) returns (Empty) {} @@ -55,6 +56,10 @@ message RpcProcessId { message RpcFileStorageId { int64 fileStorageId = 1; } +message RpcFileStorageIdWithDomainName { + int64 fileStorageId = 1; + string targetDomainName = 2; +} message RpcFileStorageIds { repeated int64 fileStorageIds = 1; } diff --git a/code/execution/java/nu/marginalia/actor/ExecutorActor.java b/code/execution/java/nu/marginalia/actor/ExecutorActor.java index d04b3eaa..e59ecd9c 100644 --- a/code/execution/java/nu/marginalia/actor/ExecutorActor.java +++ b/code/execution/java/nu/marginalia/actor/ExecutorActor.java @@ -3,6 +3,7 @@ package nu.marginalia.actor; public enum ExecutorActor { CRAWL, RECRAWL, + RECRAWL_SINGLE_DOMAIN, CONVERT_AND_LOAD, PROC_CONVERTER_SPAWNER, PROC_LOADER_SPAWNER, diff --git a/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java b/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java index 6f37d7ab..591119f8 100644 --- a/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java +++ b/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java @@ -26,6 +26,7 @@ public class ExecutorActorControlService { private final ExecutorActorStateMachines stateMachines; public Map actorDefinitions = new HashMap<>(); private final int node; + @Inject public ExecutorActorControlService(MessageQueueFactory messageQueueFactory, BaseServiceParams baseServiceParams, @@ -33,6 +34,7 @@ public class ExecutorActorControlService { ConvertAndLoadActor convertAndLoadActor, CrawlActor crawlActor, RecrawlActor recrawlActor, + RecrawlSingleDomainActor recrawlSingleDomainActor, RestoreBackupActor restoreBackupActor, ConverterMonitorActor converterMonitorFSM, CrawlerMonitorActor crawlerMonitorActor, @@ -57,6 +59,8 @@ public class ExecutorActorControlService { register(ExecutorActor.CRAWL, crawlActor); register(ExecutorActor.RECRAWL, recrawlActor); + register(ExecutorActor.RECRAWL_SINGLE_DOMAIN, 
recrawlSingleDomainActor); + register(ExecutorActor.CONVERT, convertActor); register(ExecutorActor.RESTORE_BACKUP, restoreBackupActor); register(ExecutorActor.CONVERT_AND_LOAD, convertAndLoadActor); diff --git a/code/execution/java/nu/marginalia/actor/task/CrawlActor.java b/code/execution/java/nu/marginalia/actor/task/CrawlActor.java index 3e097554..0a742888 100644 --- a/code/execution/java/nu/marginalia/actor/task/CrawlActor.java +++ b/code/execution/java/nu/marginalia/actor/task/CrawlActor.java @@ -50,7 +50,9 @@ public class CrawlActor extends RecordActorPrototype { storageService.relateFileStorages(storage.id(), dataArea.id()); // Send convert request - long msgId = mqCrawlerOutbox.sendAsync(new CrawlRequest(List.of(fid), dataArea.id())); + long msgId = mqCrawlerOutbox.sendAsync( + CrawlRequest.forSpec(fid, dataArea.id()) + ); yield new Crawl(msgId); } diff --git a/code/execution/java/nu/marginalia/actor/task/RecrawlActor.java b/code/execution/java/nu/marginalia/actor/task/RecrawlActor.java index 2b748ced..0eefd4ef 100644 --- a/code/execution/java/nu/marginalia/actor/task/RecrawlActor.java +++ b/code/execution/java/nu/marginalia/actor/task/RecrawlActor.java @@ -59,7 +59,7 @@ public class RecrawlActor extends RecordActorPrototype { refreshService.synchronizeDomainList(); - long id = mqCrawlerOutbox.sendAsync(new CrawlRequest(null, fid)); + long id = mqCrawlerOutbox.sendAsync(CrawlRequest.forRecrawl(fid)); yield new Crawl(id, fid, cascadeLoad); } diff --git a/code/execution/java/nu/marginalia/actor/task/RecrawlSingleDomainActor.java b/code/execution/java/nu/marginalia/actor/task/RecrawlSingleDomainActor.java new file mode 100644 index 00000000..990da5aa --- /dev/null +++ b/code/execution/java/nu/marginalia/actor/task/RecrawlSingleDomainActor.java @@ -0,0 +1,85 @@ +package nu.marginalia.actor.task; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.actor.prototype.RecordActorPrototype; +import nu.marginalia.actor.state.ActorResumeBehavior; +import nu.marginalia.actor.state.ActorStep; +import nu.marginalia.actor.state.Resume; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqapi.crawling.CrawlRequest; +import nu.marginalia.process.ProcessOutboxes; +import nu.marginalia.process.ProcessService; +import nu.marginalia.storage.FileStorageService; +import nu.marginalia.storage.model.FileStorageId; +import nu.marginalia.storage.model.FileStorageType; + +@Singleton +public class RecrawlSingleDomainActor extends RecordActorPrototype { + + private final MqOutbox mqCrawlerOutbox; + private final FileStorageService storageService; + private final ActorProcessWatcher processWatcher; + + /** Initial step + * @param storageId - the id of the storage to recrawl + * @param targetDomainName - domain to be recrawled + */ + public record Initial(FileStorageId storageId, String targetDomainName) implements ActorStep {} + + /** The action step */ + @Resume(behavior = ActorResumeBehavior.RETRY) + public record Crawl(long messageId) implements ActorStep {} + + @Override + public ActorStep transition(ActorStep self) throws Exception { + return switch (self) { + case Initial (FileStorageId fid, String targetDomainName) -> { + var crawlStorage = storageService.getStorage(fid); + + if (crawlStorage == null) yield new Error("Bad storage id"); + if (crawlStorage.type() != FileStorageType.CRAWL_DATA) yield new Error("Bad storage type " + crawlStorage.type()); + + long id = 
mqCrawlerOutbox.sendAsync( + CrawlRequest.forSingleDomain(targetDomainName, fid) + ); + + yield new Crawl(id); + } + case Crawl (long msgId) -> { + var rsp = processWatcher.waitResponse( + mqCrawlerOutbox, + ProcessService.ProcessId.CRAWLER, + msgId); + + if (rsp.state() != MqMessageState.OK) { + yield new Error("Crawler failed"); + } + + yield new End(); + } + default -> new End(); + }; + } + + @Override + public String describe() { + return "Run the crawler only re-fetching a single domain"; + } + + @Inject + public RecrawlSingleDomainActor(ActorProcessWatcher processWatcher, + ProcessOutboxes processOutboxes, + FileStorageService storageService, + Gson gson) + { + super(gson); + + this.processWatcher = processWatcher; + this.mqCrawlerOutbox = processOutboxes.getCrawlerOutbox(); + this.storageService = storageService; + } + +} diff --git a/code/execution/java/nu/marginalia/execution/ExecutorCrawlGrpcService.java b/code/execution/java/nu/marginalia/execution/ExecutorCrawlGrpcService.java index b95f64d0..20648015 100644 --- a/code/execution/java/nu/marginalia/execution/ExecutorCrawlGrpcService.java +++ b/code/execution/java/nu/marginalia/execution/ExecutorCrawlGrpcService.java @@ -47,6 +47,22 @@ public class ExecutorCrawlGrpcService extends ExecutorCrawlApiGrpc.ExecutorCrawl } } + @Override + public void triggerSingleDomainRecrawl(RpcFileStorageIdWithDomainName request, StreamObserver responseObserver) { + try { + actorControlService.startFrom(ExecutorActor.RECRAWL_SINGLE_DOMAIN, + new RecrawlSingleDomainActor.Initial( + FileStorageId.of(request.getFileStorageId()), + request.getTargetDomainName())); + + responseObserver.onNext(Empty.getDefaultInstance()); + responseObserver.onCompleted(); + } + catch (Exception e) { + responseObserver.onError(e); + } + } + @Override public void triggerConvert(RpcFileStorageId request, StreamObserver responseObserver) { try { diff --git a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java b/code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java index bd537c6e..7ff09289 100644 --- a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java +++ b/code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java @@ -6,6 +6,7 @@ import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.nio.file.Files; import java.nio.file.Path; import java.sql.Connection; import java.sql.DriverManager; @@ -24,6 +25,10 @@ public class AnchorTagsImpl implements AnchorTagsSource { logger.info("Loading atags from " + atagsPath); + if (!Files.exists(atagsPath)) { + throw new IllegalArgumentException("atags file does not exist: " + atagsPath); + } + try (var stmt = duckdbConnection.createStatement()) { // Insert the domains into a temporary table, then use that to filter the atags table @@ -35,13 +40,18 @@ public class AnchorTagsImpl implements AnchorTagsSource { } } - // Project the atags table down to only the relevant domains. This looks like an SQL injection - // vulnerability if you're a validation tool, but the string comes from a trusted source. 
+ // This is a SQL injection vulnerability if you're a validation tool, but the string comes from a trusted source + // -- we validate nonetheless to present a better error message + String path = atagsPath.toAbsolutePath().toString(); + if (path.contains("'")) { + throw new IllegalArgumentException("atags file path contains a single quote: " + path + " and would break the query."); + } + stmt.executeUpdate(""" create table atags as select * from '%s' where dest in (select * from domains) - """.formatted(atagsPath.toAbsolutePath())); + """.formatted(path)); // Free up the memory used by the domains table stmt.executeUpdate("drop table domains"); diff --git a/code/libraries/array/cpp/.gitignore b/code/libraries/array/cpp/.gitignore new file mode 100644 index 00000000..a52549f5 --- /dev/null +++ b/code/libraries/array/cpp/.gitignore @@ -0,0 +1 @@ +resources/libcpp.so diff --git a/code/libraries/array/cpp/compile.sh b/code/libraries/array/cpp/compile.sh old mode 100644 new mode 100755 index 89c6d1d6..47713569 --- a/code/libraries/array/cpp/compile.sh +++ b/code/libraries/array/cpp/compile.sh @@ -7,4 +7,4 @@ if ! which ${CXX} > /dev/null; then exit 0 fi -${CXX} -O3 -march=native -shared -Isrc/main/public src/main/cpp/*.cpp -o resources/libcpp.so \ No newline at end of file +${CXX} -O3 -march=native -std=c++14 -shared -Isrc/main/public src/main/cpp/*.cpp -o resources/libcpp.so diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/crawling/CrawlRequest.java b/code/process-mqapi/java/nu/marginalia/mqapi/crawling/CrawlRequest.java index 40cd30ce..ff090140 100644 --- a/code/process-mqapi/java/nu/marginalia/mqapi/crawling/CrawlRequest.java +++ b/code/process-mqapi/java/nu/marginalia/mqapi/crawling/CrawlRequest.java @@ -14,8 +14,24 @@ public class CrawlRequest { */ public List specStorage; + /** (optional) Name of a single domain to be re-crawled */ + public String targetDomainName; + /** File storage where the crawl data will be written. If it contains existing crawl data, * this crawl data will be referenced for e-tags and last-mofified checks. 
*/ public FileStorageId crawlStorage; + + public static CrawlRequest forSpec(FileStorageId specStorage, FileStorageId crawlStorage) { + return new CrawlRequest(List.of(specStorage), null, crawlStorage); + } + + public static CrawlRequest forSingleDomain(String targetDomainName, FileStorageId crawlStorage) { + return new CrawlRequest(null, targetDomainName, crawlStorage); + } + + public static CrawlRequest forRecrawl(FileStorageId crawlStorage) { + return new CrawlRequest(null, null, crawlStorage); + } + } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java index 1b04c0f9..5173af75 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java @@ -23,6 +23,7 @@ import nu.marginalia.crawling.io.CrawledDomainReader; import nu.marginalia.crawling.io.CrawlerOutputFile; import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; import nu.marginalia.crawlspec.CrawlSpecFileNames; +import nu.marginalia.model.EdgeDomain; import nu.marginalia.service.ProcessMainClass; import nu.marginalia.storage.FileStorageService; import nu.marginalia.model.crawlspec.CrawlSpecRecord; @@ -136,7 +137,12 @@ public class CrawlerMain extends ProcessMainClass { var instructions = crawler.fetchInstructions(); try { - crawler.run(instructions.specProvider, instructions.outputDir); + if (instructions.targetDomainName != null) { + crawler.runForSingleDomain(instructions.targetDomainName, instructions.outputDir); + } + else { + crawler.run(instructions.specProvider, instructions.outputDir); + } instructions.ok(); } catch (Exception ex) { logger.error("Crawler failed", ex); @@ -200,6 +206,26 @@ public class CrawlerMain extends ProcessMainClass { } } + public void runForSingleDomain(String targetDomainName, Path outputDir) throws Exception { + + heartbeat.start(); + + try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler-" + targetDomainName.replace('/', '-') + ".log")); + WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir); + AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(List.of(new EdgeDomain(targetDomainName))) + ) { + var spec = new CrawlSpecRecord(targetDomainName, 1000, null); + var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, workLog); + task.run(); + } + catch (Exception ex) { + logger.warn("Exception in crawler", ex); + } + finally { + heartbeat.shutDown(); + } + } + class CrawlTask implements SimpleBlockingThreadPool.Task { private final CrawlSpecRecord specification; @@ -216,7 +242,8 @@ public class CrawlerMain extends ProcessMainClass { AnchorTagsSource anchorTagsSource, Path outputDir, WarcArchiverIf warcArchiver, - WorkLog workLog) { + WorkLog workLog) + { this.specification = specification; this.anchorTagsSource = anchorTagsSource; this.outputDir = outputDir; @@ -303,11 +330,19 @@ public class CrawlerMain extends ProcessMainClass { private final MqMessage message; private final MqSingleShotInbox inbox; - CrawlRequest(CrawlSpecProvider specProvider, Path outputDir, MqMessage message, MqSingleShotInbox inbox) { + private final String targetDomainName; + + CrawlRequest(CrawlSpecProvider specProvider, + String targetDomainName, + Path outputDir, + MqMessage message, + MqSingleShotInbox inbox) + { this.message = message; this.inbox = inbox; this.specProvider = specProvider; this.outputDir = outputDir; + 
this.targetDomainName = targetDomainName; } @@ -325,6 +360,7 @@ public class CrawlerMain extends ProcessMainClass { var inbox = messageQueueFactory.createSingleShotInbox(CRAWLER_INBOX, node, UUID.randomUUID()); logger.info("Waiting for instructions"); + var msgOpt = getMessage(inbox, nu.marginalia.mqapi.crawling.CrawlRequest.class.getSimpleName()); var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received")); @@ -350,6 +386,7 @@ public class CrawlerMain extends ProcessMainClass { return new CrawlRequest( specProvider, + request.targetDomainName, crawlData.asPath(), msg, inbox); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 7980f3a7..1df0301b 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -183,6 +183,8 @@ public class HttpFetcherImpl implements HttpFetcher { getBuilder.url(url.toString()) .addHeader("Accept-Encoding", "gzip") + .addHeader("Accept-Language", "en,*;q=0.5") + .addHeader("Accept", "text/html, application/xhtml+xml, */*;q=0.8") .addHeader("User-agent", userAgentString); contentTags.paint(getBuilder); @@ -225,6 +227,7 @@ public class HttpFetcherImpl implements HttpFetcher { getBuilder.url(url.toString()) .addHeader("Accept-Encoding", "gzip") + .addHeader("Accept", "text/*, */*;q=0.9") .addHeader("User-agent", userAgentString); HttpFetchResult result = recorder.fetch(client, getBuilder.build()); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java index b6b8a589..4833c6e7 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java @@ -32,7 +32,7 @@ public class NoSecuritySSL { @SneakyThrows public static SSLSocketFactory buildSocketFactory() { // Install the all-trusting trust manager - final SSLContext sslContext = SSLContext.getInstance("SSL"); + final SSLContext sslContext = SSLContext.getInstance("TLS"); sslContext.init(null, trustAllCerts, new java.security.SecureRandom()); var clientSessionContext = sslContext.getClientSessionContext(); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java index b3a8ab4f..79b4c86a 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java @@ -8,7 +8,7 @@ import java.security.NoSuchAlgorithmException; class WarcDigestBuilder { private final MessageDigest digest; - private static final String digestAlgorithm = "SHA-1"; + private static final String digestAlgorithm = "SHA-256"; public WarcDigestBuilder() throws NoSuchAlgorithmException { this.digest = MessageDigest.getInstance(digestAlgorithm); diff --git a/code/services-application/search-service/resources/static/search/main.js b/code/services-application/search-service/resources/static/search/main.js 
index 618533b7..a6bd3157 100644 --- a/code/services-application/search-service/resources/static/search/main.js +++ b/code/services-application/search-service/resources/static/search/main.js @@ -1,6 +1,6 @@ -// This sets the data-has-js attribute on the body tag to true, so we can style the page with the assumption that +// This sets the data-has-js attribute on the html tag to true, so we can style the page with the assumption that // the browser supports JS. This is a progressive enhancement, so the page will still work without JS. -document.getElementsByTagName('body')[0].setAttribute('data-has-js', 'true'); +document.documentElement.setAttribute('data-has-js', 'true'); // To prevent the filter menu from being opened when the user hits enter on the search box, we need to add a keydown // handler to the search box that stops the event from propagating. Janky hack, but it works. diff --git a/code/services-application/search-service/resources/static/search/serp.scss b/code/services-application/search-service/resources/static/search/serp.scss index 089c3884..3e25e780 100644 --- a/code/services-application/search-service/resources/static/search/serp.scss +++ b/code/services-application/search-service/resources/static/search/serp.scss @@ -1,33 +1,102 @@ -$nicotine-dark: #acae89; -$nicotine-light: #f8f8ee; -$fg-dark: #000; -$fg-light: #fff; -$highlight-dark: #2f4858; -$highlight-light: #3F5F6F; -$highlight-light2: #eee; -$border-color: #ccc; -$border-color2: #aaa; -$heading-fonts: serif; -$visited: #fcc; +:root { + color-scheme: light; + + --clr-bg-page: hsl(60, 42%, 95%); // $nicotine-light + + --clr-bg-ui: hsl(0, 0%, 100%); + --clr-text-ui: #000; // $fg-dark + + --clr-bg-theme: hsl(200, 28%, 34%); // $highlight-light + --clr-text-theme: #fff; // $fg-light + + --clr-bg-highlight: hsl(0, 0%, 93%); // $highlight-light2 + --clr-text-highlight: #111111; + + --clr-bg-accent: hsl(63, 19%, 61%); // $nicotine-dark + --clr-border-accent: hsl(63, 19%, 35%); + + --clr-border: #aaa; // $border-color2 + + --clr-shadow: var(--clr-border); + + --clr-link: #0066cc; + --clr-link-visited: #531a89; + --clr-heading-link-visited: #fcc; // $visited + + --font-family: sans-serif; + --font-size: 14px; + --font-family-heading: serif; // $heading-fonts +} + + +@mixin dark-theme-mixin { + color-scheme: dark; + + --clr-bg-page: hsl(0, 0%, 6%); + + --clr-bg-ui: hsl(0, 0%, 18%); + --clr-text-ui: #ddd; + + --clr-bg-theme: hsl(0, 0%, 2%); + --clr-text-theme: var(--clr-text-ui); + + --clr-bg-highlight: hsl(0, 0%, 11%); + --clr-text-highlight: #fff; + + --clr-bg-accent: hsl(200, 32%, 28%); + --clr-border-accent: hsl(200, 8%, 12%); + + --clr-border: hsl(0, 0%, 30%); + + --clr-shadow: #000; + + --clr-link: #8a8aff; + --clr-link-visited: #ffadff; + --clr-heading-link-visited: var(--clr-link-visited); +} + +:root[data-theme='dark'] { + @include dark-theme-mixin; +} + +// Makes theme match the user's OS preference when JS is disabled +@media (prefers-color-scheme: dark) { + :root:not([data-has-js="true"]) { + @include dark-theme-mixin; + } +} * { box-sizing: border-box; } + +a { + color: var(--clr-link); +} + +a:visited { + color: var(--clr-link-visited); +} + +input, textarea, select { + color: inherit; +} + h1 a, h2 a { - color: $fg-light; + color: var(--clr-text-theme); } h1 a:visited, h2 a:visited { - color: $visited; + color: var(--clr-heading-link-visited); } progress { width: 10ch; } body { - background-color: $nicotine-light; - color: $fg-dark; - font-family: sans-serif; - font-size: 14px; + background-color: 
var(--clr-bg-page); + color: var(--clr-text-ui); + font-family: var(--font-family); + font-size: var(--font-size); line-height: 1.6; margin-left: auto; margin-right: auto; @@ -99,28 +168,28 @@ body { li { display: inline; padding: 1ch; - background-color: $highlight-light2; + background-color: var(--clr-bg-highlight); a { text-decoration: none; display: inline-block; - color: #000; + color: var(--clr-text-highlight); } } li.current { - background-color: $highlight-light; + background-color: var(--clr-bg-theme); a { - color: #fff; + color: var(--clr-text-theme); } } } } .dialog { - border: 1px solid $border-color2; - box-shadow: 0 0 1ch $border-color; - background-color: #fff; + border: 1px solid var(--clr-border); + box-shadow: 0 0 1ch var(--clr-shadow); + background-color: var(--clr-bg-ui); padding: 1ch; h2 { @@ -129,43 +198,58 @@ body { font-weight: normal; padding: 0.5ch; font-size: 12pt; - background-color: $highlight-light; - color: #fff; + background-color: var(--clr-bg-theme); + color: var(--clr-text-theme); } } header { - background-color: $nicotine-dark; - color: #fff; - border: 1px solid #888; - box-shadow: 0 0 0.5ch #888; + background-color: var(--clr-bg-accent); + border: 1px solid var(--clr-border-accent); + color: var(--clr-text-ui); + box-shadow: 0 0 0.5ch var(--clr-shadow); margin-bottom: 1ch; + display: flex; + align-items: center; + justify-content: space-between; nav { a { text-decoration: none; - color: #000; - + color: var(--clr-text-ui); padding: .5ch; display: inline-block; } + a:visited { + color: var(--clr-text-ui); + } + a.extra { background: #ccc linear-gradient(45deg, - rgba(255,100,100,1) 0%, - rgba(100,255,100,1) 50%, - rgba(100,100,255,1) 100%); + hsl(0, 100%, 70%) 0%, + hsl(120, 100%, 70%) 50%, + hsl(240, 100%, 70%) 100%); color: black; text-shadow: 0 0 0.5ch #fff; } a:hover, a:focus { - background: #2f4858; - color: #fff !important; + background: var(--clr-bg-theme); + color: var(--clr-text-theme); } } } +#theme { + padding: .5ch; + display: none; + + [data-has-js='true'] & { + display: block; + } +} + #complaint { @extend .dialog; max-width: 60ch; @@ -210,11 +294,11 @@ header { @extend .heading; } - background-color: #fff; + background-color: var(--clr-bg-ui); padding: 1ch; margin: 1ch; - border: 1px solid $border-color2; - box-shadow: 0 0 1ch $border-color; + border: 1px solid var(--clr-border); + box-shadow: 0 0 1ch var(--clr-shadow); } section.cards { @@ -226,11 +310,10 @@ section.cards { justify-content: flex-start; .card { - border: 2px #ccc; - background-color: #fff; + background-color: var(--clr-bg-ui); border-left: 1px solid #ecb; border-top: 1px solid #ecb; - box-shadow: #0008 0 0 5px; + box-shadow: var(--clr-shadow) 0 0 5px; h2 { @extend .heading; @@ -239,7 +322,7 @@ section.cards { h2 a { display: block !important; - color: #fff; + color: inherit; text-decoration: none; } a:focus img { @@ -271,12 +354,17 @@ section.cards { padding-right: 1ch; line-height: 1.6; } + + [data-theme='dark'] & { + border: 1px solid var(--clr-border); + } } } .positions { - box-shadow: 0 0 2px #888; - background-color: #e4e4e4; + box-shadow: 0 0 2px var(--clr-shadow); + backdrop-filter: brightness(90%); + color: var(--clr-text-highlight); padding: 2px; margin-right: -1ch; margin-left: 1ch; @@ -297,13 +385,13 @@ footer { h1 { font-weight: normal; - border-bottom: 4px solid $highlight-light; + border-bottom: 4px solid var(--clr-bg-theme); } h2 { font-size: 14pt; font-weight: normal; - border-bottom: 2px solid $highlight-dark; + border-bottom: 2px solid 
var(--clr-bg-theme); width: 80%; } @@ -312,9 +400,9 @@ flex-basis: 40ch; flex-grow: 1.1; - background-color: #fff; - border-left: 1px solid $border-color2; - box-shadow: -1px -1px 5px $border-color; + background-color: var(--clr-bg-ui); + border-left: 1px solid var(--clr-border); + box-shadow: -1px -1px 5px var(--clr-shadow); padding-left: 1ch; padding-right: 1ch; @@ -329,18 +417,18 @@ } .shadowbox { - box-shadow: 0 0 1ch $border-color2; - border: 1px solid $border-color; + box-shadow: 0 0 1ch var(--clr-shadow); + border: 1px solid var(--clr-border); } .heading { margin: 0; padding: 0.5ch; - background-color: $highlight-light; - border-bottom: 1px solid $border-color2; - font-family: $heading-fonts; + background-color: var(--clr-bg-theme); + border-bottom: 1px solid var(--clr-border); + font-family: var(--font-family-heading); font-weight: normal; - color: $fg-light; + color: var(--clr-text-theme); font-size: 12pt; word-break: break-word; } @@ -440,7 +528,7 @@ @extend .shadowbox; padding: 0.5ch; - background-color: $fg-light; + background-color: var(--clr-bg-ui); display: grid; grid-template-columns: max-content 0 auto max-content; grid-gap: 0.5ch; @@ -452,12 +540,13 @@ padding: 0.5ch; font-size: 14pt; word-break: keep-all; - background-color: $highlight-light; - color: $fg-light; - font-family: $heading-fonts; + background-color: var(--clr-bg-theme); + color: var(--clr-text-theme); + font-family: var(--font-family-heading); font-weight: normal; - border: 1px solid; text-align: center; + display: flex; + justify-content: space-between; } #suggestions-anchor { @@ -469,18 +558,18 @@ font-family: monospace; font-size: 12pt; padding: 0.5ch; - border: 1px solid $border-color2; - background-color: $fg-light; - color: $fg-dark; + border: 1px solid var(--clr-border); + background-color: inherit; } input[type="submit"] { font-size: 12pt; - border: 1px solid $border-color2; - background-color: $fg-light; - color: $fg-dark; + border: 1px solid var(--clr-border); + background-color: var(--clr-bg-ui); + cursor: pointer; } + // white suggestions look fine in dark mode .suggestions { background-color: #fff; padding: .5ch; @@ -491,7 +580,7 @@ width: 300px; border-left: 1px solid #ccc; border-top: 1px solid #ccc; - box-shadow: 5px 5px 5px #888; + box-shadow: 5px 5px 5px var(--clr-shadow); z-index: 10; a { @@ -528,22 +617,22 @@ #filters { @extend .shadowbox; margin-top: 1ch; - background-color: $fg-light; + background-color: var(--clr-bg-ui); h2 { @extend .heading; - background-color: $highlight-light; + background-color: var(--clr-bg-theme); } h3 { @extend .heading; - background-color: $highlight-light2; + background-color: var(--clr-bg-highlight); + color: var(--clr-text-highlight); font-family: sans-serif; - color: #000; border-bottom: 1px solid #000; } hr { - border-top: 0.5px solid $border-color2; + border-top: 0.5px solid var(--clr-border); border-bottom: none; } ul { @@ -553,17 +642,17 @@ li { padding: 1ch; a { - color: $fg-dark; + color: inherit; text-decoration: none; } a:hover, a:focus { - border-bottom: 1px solid $highlight-light; + border-bottom: 1px solid var(--clr-bg-theme); } } li.current { - border-left: 4px solid $highlight-light; - background-color: $highlight-light2; + border-left: 4px solid var(--clr-bg-theme); + background-color: var(--clr-bg-highlight); a { margin-left: -4px; } @@ -576,46 +665,46 @@ margin: 1ch 0 2ch 0; .url { - background-color: $highlight-light; + background-color: var(--clr-bg-theme);
padding-left: 0.5ch; a { word-break: break-all; font-family: monospace; font-size: 8pt; - color: $fg-light; + color: var(--clr-text-theme); text-shadow: 0 0 1ch #000; // guarantee decent contrast across background colors } a:visited { - color: $visited; + color: var(--clr-heading-link-visited); } } h2 { a { word-break: break-all; - color: $fg-dark; + color: var(--clr-text-ui); text-decoration: none; } font-size: 12pt; @extend .heading; - background-color: $highlight-light2; + background-color:var(--clr-bg-highlight); } .description { - background-color: $fg-light; + background-color: var(--clr-bg-ui); word-break: break-word; padding: 1ch; margin: 0; } ul.additional-results { - background-color: $fg-light; + background-color: var(--clr-bg-ui); padding: 1ch; list-style: none; margin: 0; a { - color: $fg-dark; + color: inherit; } } } @@ -631,7 +720,7 @@ footer { display: flex; font-size: 10pt; padding: 1ch; - background-color: #eee; + background-color: var(--clr-bg-highlight); > * { margin-right: 1ch; @@ -645,12 +734,12 @@ footer { padding-left: 4px; } a { - color: #000; + color: var(--clr-text-highlight); } } @media (max-device-width: 624px) { - body[data-has-js="true"] { // This property is set via js so we can selectively enable these changes only if JS is enabled; + [data-has-js="true"] body { // This property is set via js so we can selectively enable these changes only if JS is enabled; // This is desirable since mobile navigation is JS-driven. If JS is disabled, having a squished // GUI is better than having no working UI. margin: 0 !important; @@ -666,6 +755,8 @@ footer { #mcfeast { display: inline; float: right; + width: 2rem; + font-size: 1rem; } #menu-close { diff --git a/code/services-application/search-service/resources/static/search/theme.js b/code/services-application/search-service/resources/static/search/theme.js new file mode 100644 index 00000000..73fdcd26 --- /dev/null +++ b/code/services-application/search-service/resources/static/search/theme.js @@ -0,0 +1,57 @@ +function getTheme() { + const theme = window.localStorage.getItem('theme'); + + // if a valid theme is set in localStorage, return it + if (theme === 'dark' || theme === 'light') { + return { value: theme, system: false }; + } + + // if matchMedia is supported and OS theme is dark + if (window.matchMedia('(prefers-color-scheme: dark)').matches) { + return { value: 'dark', system: true }; + } + + return { value: 'light', system: true }; +} + +function setTheme(value) { + if (value === 'dark' || value === 'light') { + window.localStorage.setItem('theme', value); + } else { + window.localStorage.removeItem('theme'); + } + + const theme = getTheme(); + + document.documentElement.setAttribute('data-theme', theme.value); +} + +function initializeTheme() { + const themeSelect = document.getElementById('theme-select'); + + const theme = getTheme(); + + document.documentElement.setAttribute('data-theme', theme.value); + + // system is selected by default in the themeSwitcher so ignore it here + if (!theme.system) { + themeSelect.value = theme.value; + } + + themeSelect.addEventListener('change', e => { + setTheme(e.target.value); + }); + + const mql = window.matchMedia('(prefers-color-scheme: dark)'); + + // if someone changes their theme at the OS level we need to update + // their theme immediately if they're using their OS theme + mql.addEventListener('change', e => { + if (themeSelect.value !== 'system') return; + + if (e.matches) setTheme('dark'); + else setTheme('light'); + }); +} + +initializeTheme(); \ No newline 
at end of file diff --git a/code/services-application/search-service/resources/static/search/tts.js b/code/services-application/search-service/resources/static/search/tts.js index 3ad24f82..20ee9f37 100644 --- a/code/services-application/search-service/resources/static/search/tts.js +++ b/code/services-application/search-service/resources/static/search/tts.js @@ -27,7 +27,7 @@ function setupTypeahead() { for (i=0;iDonate Random +
+ + +
+ + + \ No newline at end of file diff --git a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlCrawlDataService.java b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlCrawlDataService.java index a1ce22c2..31928bdb 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlCrawlDataService.java +++ b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlCrawlDataService.java @@ -86,7 +86,7 @@ public class ControlCrawlDataService { ORDER BY httpStatus """); while (rs.next()) { - final boolean isCurrentFilter = selectedContentType.equals(rs.getString("httpStatus")); + final boolean isCurrentFilter = selectedHttpStatus.equals(rs.getString("httpStatus")); final int status = rs.getInt("httpStatus"); final int cnt = rs.getInt("cnt"); diff --git a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java index 4b833789..c385e52e 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java +++ b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java @@ -24,6 +24,7 @@ import java.nio.file.Path; import java.sql.SQLException; import java.util.Arrays; import java.util.List; +import java.util.Objects; import java.util.Set; @Singleton @@ -88,6 +89,9 @@ public class ControlNodeActionsService { Spark.post("/nodes/:id/actions/recrawl", this::triggerAutoRecrawl, redirectControl.renderRedirectAcknowledgement("Recrawling", "..") ); + Spark.post("/nodes/:id/actions/recrawl-single-domain", this::triggerSingleDomainRecrawl, + redirectControl.renderRedirectAcknowledgement("Recrawling", "..") + ); Spark.post("/nodes/:id/actions/process", this::triggerProcess, redirectControl.renderRedirectAcknowledgement("Processing", "..") ); @@ -216,6 +220,21 @@ public class ControlNodeActionsService { return ""; } + private Object triggerSingleDomainRecrawl(Request request, Response response) throws SQLException { + int nodeId = Integer.parseInt(request.params("id")); + + var toCrawl = parseSourceFileStorageId(request.queryParams("source")); + var targetDomainName = Objects.requireNonNull(request.queryParams("targetDomainName")); + + crawlClient.triggerRecrawlSingleDomain( + nodeId, + toCrawl, + targetDomainName + ); + + return ""; + } + private Object triggerNewCrawl(Request request, Response response) throws SQLException { int nodeId = Integer.parseInt(request.params("id")); diff --git a/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb b/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb index 2be78e28..32ab39a4 100644 --- a/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb +++ b/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb @@ -24,12 +24,20 @@

Summary

- + + +
DomainFileDomainFileCrawl
{{domain}} Download Parquet +
+ + + +
+

Contents

diff --git a/run/install.sh b/run/install.sh index 4b2bb359..636ee473 100755 --- a/run/install.sh +++ b/run/install.sh @@ -41,7 +41,7 @@ echo echo "1) barebones instance (1 node)" echo "2) barebones instance (2 nodes)" echo "3) full Marginalia Search instance?" -echo "4) non-docker install? (not recommended)" +echo "4) non-docker install? (proof-of-concept, not recommended)" echo read -p "Enter 1, 2, 3, or 4: " INSTANCE_TYPE @@ -149,17 +149,24 @@ elif [ "${INSTANCE_TYPE}" == "4" ]; then envsubst < install/docker-compose-scaffold.yml.template >${INSTALL_DIR}/docker-compose.yml cat < ${INSTALL_DIR}/README -Quick note about running Marginalia Search in a non-docker environment: +Quick note about running Marginalia Search in a non-docker environment. -* The template sets up a sample (in-docker) setup for - mariadb and zookeeper. These can also be run outside - of docker, but you will need to update the db.properties - file and "zookeeper-hosts" in the system.properties - file to point to the correct locations/addresses. -* Each service is spawned by the same launcher. When building - the project with "gradlew assemble", the launcher is put in - "code/services-core/single-service-runner/build/distributions/marginalia.tar". - This needs to be extracted. +Beware that this installation mode is more of a proof of concept, demonstrating that the +system is not unhealthily dependent on docker, than a production-ready setup. It is not +recommended for production use! The container setup is much more robust and easier to manage. + +Note: This script only sets up an install directory, and does not build the system. +You will need to build the system with "gradlew assemble" before you can run it. + +Each service is spawned by the same launcher. After building the project with +"gradlew assemble", the launcher is put in "code/services-core/single-service-runner/build/distributions/marginalia.tar". +This needs to be extracted! + +Note: The template sets up a sample (in-docker) setup for mariadb and zookeeper. These can also be run outside +of docker, but you will need to update the db.properties file and "zookeeper-hosts" in the system.properties +file to point to the correct locations/addresses. + +Running: To launch a process you need to unpack it, and then run the launcher with the appropriate arguments. For example: @@ -177,13 +184,16 @@ A working setup needs all the services * query * index [ http port is internal ] * executor [ http port is internal ] -The index and executor services should be on the same partition e.g. index:1 and executor:1, -which should be a number larger than 0. You can have multiple pairs of index and executor partitions, -but the pair should run on the same physical machine with the same install directory. +Since you will need to manage ports yourself, you must assign a distinct port pair to each service. -The query service can use any partition number. +* The index and executor services should exist on the same partition, e.g. index:1 and executor:1. The partition +number is the last digit of the service name, and should be positive. You can have multiple pairs of index +and executor partitions, but each pair should run on the same physical machine with the same install directory. + +* The query service can use any partition number. + +* The control service should be on partition 1. -The control service should be on partition 1.
EOF echo diff --git a/run/readme.md b/run/readme.md index 0a890feb..041f5576 100644 --- a/run/readme.md +++ b/run/readme.md @@ -3,11 +3,11 @@ This directory is a staging area for running the system. It contains scripts and templates for installing the system on a server, and for running it locally. -See [https://docs.marginalia.nu/](https://docs.marginalia.nu/) for additional -documentation. - ## Requirements +**x86-64 Linux** - The system is only tested on x86-64 Linux. It may work on other +platforms, but for lack of suitable hardware, this cannot be guaranteed. + **Docker** - It is a bit of a pain to install, but if you follow [this guide](https://docs.docker.com/engine/install/ubuntu/#install-using-the-repository) you're on the right track for ubuntu-like systems. @@ -15,7 +15,12 @@ documentation. The civilized way of installing this is to use [SDKMAN](https://sdkman.io/); graalce is a good distribution choice but it doesn't matter too much. -## Set up +## Quick Set up + +[https://docs.marginalia.nu/](https://docs.marginalia.nu/) has a more comprehensive guide for the installation +and operation of the search engine. This is a quick guide for the impatient. + +--- To go from a clean checkout of the git repo to a running search engine, follow these steps. @@ -51,6 +56,8 @@ you for which installation mode you want to use. The options are: 2. Full Marginalia Search instance - This will install an instance of the search engine configured like [search.marginalia.nu](https://search.marginalia.nu). This is useful for local development and testing. +3. Non-docker installation - This will install the system outside of docker. + This is still an experimental run mode. It will also prompt you for account details for a new mariadb instance, which will be created for you.
The database will be initialized with the schema and data required diff --git a/settings.gradle b/settings.gradle index 2daa7997..3d710034 100644 --- a/settings.gradle +++ b/settings.gradle @@ -210,8 +210,8 @@ dependencyResolutionManagement { library('sqlite','org.xerial','sqlite-jdbc').version('3.41.2.2') library('javax.annotation','javax.annotation','javax.annotation-api').version('1.3.2') - library('parquet-column', 'org.apache.parquet','parquet-column').version('1.13.1') - library('parquet-hadoop', 'org.apache.parquet','parquet-hadoop').version('1.13.1') + library('parquet-column', 'org.apache.parquet','parquet-column').version('1.14.0') + library('parquet-hadoop', 'org.apache.parquet','parquet-hadoop').version('1.14.0') library('curator-framework', 'org.apache.curator','curator-framework').version('5.6.0') library('curator-x-discovery', 'org.apache.curator','curator-x-discovery').version('5.6.0') diff --git a/third-party/parquet-floor/build.gradle b/third-party/parquet-floor/build.gradle index 7b0de520..08443bb0 100644 --- a/third-party/parquet-floor/build.gradle +++ b/third-party/parquet-floor/build.gradle @@ -9,7 +9,7 @@ java { } dependencies { - implementation ('org.apache.parquet:parquet-column:1.13.1') { + implementation ('org.apache.parquet:parquet-column:1.14.0') { transitive = true } implementation('org.apache.parquet:parquet-hadoop:1.13.1') { diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configuration.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configuration.java index a9c3231d..9b0fda55 100644 --- a/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configuration.java +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configuration.java @@ -1,6 +1,7 @@ package org.apache.hadoop.conf; public class Configuration { + public Configuration(boolean x) {} public boolean getBoolean(String x, boolean y) { return y;