Merge branch 'master' into term-positions

This commit is contained in:
Viktor 2024-07-15 07:05:31 +02:00 committed by GitHub
commit 8ed5b51a32
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
29 changed files with 521 additions and 128 deletions

View File

@ -44,6 +44,15 @@ public class ExecutorCrawlClient {
.build());
}
/** Ask the executor on the given node to trigger a single-domain recrawl.
 *
 * @param node       node the call is directed at
 * @param fid        file storage whose id is put in the request
 * @param domainName domain name put in the request as the recrawl target
 */
public void triggerRecrawlSingleDomain(int node, FileStorageId fid, String domainName) {
    // Resolve the call target first, then assemble the message, then dispatch
    var call = channelPool
            .call(ExecutorCrawlApiBlockingStub::triggerSingleDomainRecrawl)
            .forNode(node);

    var rpcRequest = RpcFileStorageIdWithDomainName.newBuilder()
            .setFileStorageId(fid.id())
            .setTargetDomainName(domainName)
            .build();

    call.run(rpcRequest);
}
public void triggerConvert(int node, FileStorageId fid) {
channelPool.call(ExecutorCrawlApiBlockingStub::triggerConvert)
.forNode(node)

View File

@ -22,6 +22,7 @@ service ExecutorApi {
service ExecutorCrawlApi {
rpc triggerCrawl(RpcFileStorageId) returns (Empty) {}
rpc triggerRecrawl(RpcFileStorageId) returns (Empty) {}
rpc triggerSingleDomainRecrawl(RpcFileStorageIdWithDomainName) returns (Empty) {}
rpc triggerConvert(RpcFileStorageId) returns (Empty) {}
rpc triggerConvertAndLoad(RpcFileStorageId) returns (Empty) {}
rpc loadProcessedData(RpcFileStorageIds) returns (Empty) {}
@ -55,6 +56,10 @@ message RpcProcessId {
message RpcFileStorageId {
int64 fileStorageId = 1;
}
// Request message of ExecutorCrawlApi.triggerSingleDomainRecrawl:
// identifies a file storage together with the one domain to recrawl.
message RpcFileStorageIdWithDomainName {
// Id of the file storage the request refers to
int64 fileStorageId = 1;
// Name of the domain to be recrawled
string targetDomainName = 2;
}
message RpcFileStorageIds {
repeated int64 fileStorageIds = 1;
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.actor;
public enum ExecutorActor {
CRAWL,
RECRAWL,
RECRAWL_SINGLE_DOMAIN,
CONVERT_AND_LOAD,
PROC_CONVERTER_SPAWNER,
PROC_LOADER_SPAWNER,

View File

@ -26,6 +26,7 @@ public class ExecutorActorControlService {
private final ExecutorActorStateMachines stateMachines;
public Map<ExecutorActor, ActorPrototype> actorDefinitions = new HashMap<>();
private final int node;
@Inject
public ExecutorActorControlService(MessageQueueFactory messageQueueFactory,
BaseServiceParams baseServiceParams,
@ -33,6 +34,7 @@ public class ExecutorActorControlService {
ConvertAndLoadActor convertAndLoadActor,
CrawlActor crawlActor,
RecrawlActor recrawlActor,
RecrawlSingleDomainActor recrawlSingleDomainActor,
RestoreBackupActor restoreBackupActor,
ConverterMonitorActor converterMonitorFSM,
CrawlerMonitorActor crawlerMonitorActor,
@ -57,6 +59,8 @@ public class ExecutorActorControlService {
register(ExecutorActor.CRAWL, crawlActor);
register(ExecutorActor.RECRAWL, recrawlActor);
register(ExecutorActor.RECRAWL_SINGLE_DOMAIN, recrawlSingleDomainActor);
register(ExecutorActor.CONVERT, convertActor);
register(ExecutorActor.RESTORE_BACKUP, restoreBackupActor);
register(ExecutorActor.CONVERT_AND_LOAD, convertAndLoadActor);

View File

@ -50,7 +50,9 @@ public class CrawlActor extends RecordActorPrototype {
storageService.relateFileStorages(storage.id(), dataArea.id());
// Send convert request
long msgId = mqCrawlerOutbox.sendAsync(new CrawlRequest(List.of(fid), dataArea.id()));
long msgId = mqCrawlerOutbox.sendAsync(
CrawlRequest.forSpec(fid, dataArea.id())
);
yield new Crawl(msgId);
}

View File

@ -59,7 +59,7 @@ public class RecrawlActor extends RecordActorPrototype {
refreshService.synchronizeDomainList();
long id = mqCrawlerOutbox.sendAsync(new CrawlRequest(null, fid));
long id = mqCrawlerOutbox.sendAsync(CrawlRequest.forRecrawl(fid));
yield new Crawl(id, fid, cascadeLoad);
}

View File

@ -0,0 +1,85 @@
package nu.marginalia.actor.task;
import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorResumeBehavior;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.actor.state.Resume;
import nu.marginalia.mq.MqMessageState;
import nu.marginalia.mq.outbox.MqOutbox;
import nu.marginalia.mqapi.crawling.CrawlRequest;
import nu.marginalia.process.ProcessOutboxes;
import nu.marginalia.process.ProcessService;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType;
@Singleton
public class RecrawlSingleDomainActor extends RecordActorPrototype {
private final MqOutbox mqCrawlerOutbox;
private final FileStorageService storageService;
private final ActorProcessWatcher processWatcher;
/** Initial step
* @param storageId - the id of the storage to recrawl
* @param targetDomainName - domain to be recrawled
*/
public record Initial(FileStorageId storageId, String targetDomainName) implements ActorStep {}
/** The action step */
@Resume(behavior = ActorResumeBehavior.RETRY)
public record Crawl(long messageId) implements ActorStep {}
@Override
public ActorStep transition(ActorStep self) throws Exception {
return switch (self) {
case Initial (FileStorageId fid, String targetDomainName) -> {
var crawlStorage = storageService.getStorage(fid);
if (crawlStorage == null) yield new Error("Bad storage id");
if (crawlStorage.type() != FileStorageType.CRAWL_DATA) yield new Error("Bad storage type " + crawlStorage.type());
long id = mqCrawlerOutbox.sendAsync(
CrawlRequest.forSingleDomain(targetDomainName, fid)
);
yield new Crawl(id);
}
case Crawl (long msgId) -> {
var rsp = processWatcher.waitResponse(
mqCrawlerOutbox,
ProcessService.ProcessId.CRAWLER,
msgId);
if (rsp.state() != MqMessageState.OK) {
yield new Error("Crawler failed");
}
yield new End();
}
default -> new End();
};
}
@Override
public String describe() {
return "Run the crawler only re-fetching a single domain";
}
@Inject
public RecrawlSingleDomainActor(ActorProcessWatcher processWatcher,
ProcessOutboxes processOutboxes,
FileStorageService storageService,
Gson gson)
{
super(gson);
this.processWatcher = processWatcher;
this.mqCrawlerOutbox = processOutboxes.getCrawlerOutbox();
this.storageService = storageService;
}
}

View File

@ -47,6 +47,22 @@ public class ExecutorCrawlGrpcService extends ExecutorCrawlApiGrpc.ExecutorCrawl
}
}
/** gRPC endpoint: starts the RECRAWL_SINGLE_DOMAIN actor from its initial step,
 * using the storage id and domain name carried in the request.
 * Replies with an empty message on success; any exception is passed to the observer.
 */
@Override
public void triggerSingleDomainRecrawl(RpcFileStorageIdWithDomainName request, StreamObserver<Empty> responseObserver) {
    try {
        var storageId = FileStorageId.of(request.getFileStorageId());
        var initialStep = new RecrawlSingleDomainActor.Initial(storageId, request.getTargetDomainName());

        actorControlService.startFrom(ExecutorActor.RECRAWL_SINGLE_DOMAIN, initialStep);

        responseObserver.onNext(Empty.getDefaultInstance());
        responseObserver.onCompleted();
    }
    catch (Exception e) {
        responseObserver.onError(e);
    }
}
@Override
public void triggerConvert(RpcFileStorageId request, StreamObserver<Empty> responseObserver) {
try {

View File

@ -6,6 +6,7 @@ import nu.marginalia.model.EdgeDomain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
@ -24,6 +25,10 @@ public class AnchorTagsImpl implements AnchorTagsSource {
logger.info("Loading atags from " + atagsPath);
if (!Files.exists(atagsPath)) {
throw new IllegalArgumentException("atags file does not exist: " + atagsPath);
}
try (var stmt = duckdbConnection.createStatement()) {
// Insert the domains into a temporary table, then use that to filter the atags table
@ -35,13 +40,18 @@ public class AnchorTagsImpl implements AnchorTagsSource {
}
}
// Project the atags table down to only the relevant domains. This looks like an SQL injection
// vulnerability if you're a validation tool, but the string comes from a trusted source.
// This is a SQL injection vulnerability if you're a validation tool, but the string comes from a trusted source
// -- we validate nonetheless to present a better error message
String path = atagsPath.toAbsolutePath().toString();
if (path.contains("'")) {
throw new IllegalArgumentException("atags file path contains a single quote: " + path + " and would break the query.");
}
stmt.executeUpdate("""
create table atags as
select * from '%s'
where dest in (select * from domains)
""".formatted(atagsPath.toAbsolutePath()));
""".formatted(path));
// Free up the memory used by the domains table
stmt.executeUpdate("drop table domains");

1
code/libraries/array/cpp/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
resources/libcpp.so

2
code/libraries/array/cpp/compile.sh Normal file → Executable file
View File

@ -7,4 +7,4 @@ if ! which ${CXX} > /dev/null; then
exit 0
fi
${CXX} -O3 -march=native -shared -Isrc/main/public src/main/cpp/*.cpp -o resources/libcpp.so
${CXX} -O3 -march=native -std=c++14 -shared -Isrc/main/public src/main/cpp/*.cpp -o resources/libcpp.so

View File

@ -14,8 +14,24 @@ public class CrawlRequest {
*/
public List<FileStorageId> specStorage;
/** (optional) Name of a single domain to be re-crawled */
public String targetDomainName;
/** File storage where the crawl data will be written. If it contains existing crawl data,
* this crawl data will be referenced for e-tags and last-modified checks.
*/
public FileStorageId crawlStorage;
/** Build a request that crawls from a single crawl spec storage.
 * No target domain is set.
 *
 * @param specStorage  the one spec storage to put in the request's spec list
 * @param crawlStorage storage where the crawl data is written
 */
public static CrawlRequest forSpec(FileStorageId specStorage, FileStorageId crawlStorage) {
    List<FileStorageId> specStorages = List.of(specStorage);
    return new CrawlRequest(specStorages, null, crawlStorage);
}
/** Build a request that re-crawls one named domain.
 * No spec storage is set.
 *
 * @param targetDomainName name of the domain to re-crawl
 * @param crawlStorage     storage where the crawl data is written
 */
public static CrawlRequest forSingleDomain(String targetDomainName, FileStorageId crawlStorage) {
    final List<FileStorageId> noSpecStorage = null;
    return new CrawlRequest(noSpecStorage, targetDomainName, crawlStorage);
}
/** Build a request that carries only a crawl storage:
 * neither a spec storage nor a target domain is set.
 *
 * @param crawlStorage storage where the crawl data is written
 */
public static CrawlRequest forRecrawl(FileStorageId crawlStorage) {
    final List<FileStorageId> noSpecStorage = null;
    final String noTargetDomain = null;
    return new CrawlRequest(noSpecStorage, noTargetDomain, crawlStorage);
}
}

View File

@ -23,6 +23,7 @@ import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.CrawlerOutputFile;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
import nu.marginalia.crawlspec.CrawlSpecFileNames;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
@ -136,7 +137,12 @@ public class CrawlerMain extends ProcessMainClass {
var instructions = crawler.fetchInstructions();
try {
crawler.run(instructions.specProvider, instructions.outputDir);
if (instructions.targetDomainName != null) {
crawler.runForSingleDomain(instructions.targetDomainName, instructions.outputDir);
}
else {
crawler.run(instructions.specProvider, instructions.outputDir);
}
instructions.ok();
} catch (Exception ex) {
logger.error("Crawler failed", ex);
@ -200,6 +206,26 @@ public class CrawlerMain extends ProcessMainClass {
}
}
/** Crawl a single domain on the calling thread, writing into {@code outputDir}.
 * <p>
 * The work log is named {@code crawler-<domain>.log}, with any '/' in the domain
 * name replaced by '-'. Exceptions raised while setting up or running the crawl
 * task are logged as warnings and not rethrown; the heartbeat is always shut down.
 *
 * @param targetDomainName the domain to crawl
 * @param outputDir        directory receiving the work log and crawl output
 */
public void runForSingleDomain(String targetDomainName, Path outputDir) throws Exception {
    heartbeat.start();

    try {
        Path workLogFile = outputDir.resolve("crawler-" + targetDomainName.replace('/', '-') + ".log");

        // Resources are acquired in the same order as before: work log, archiver, anchor tags
        try (WorkLog workLog = new WorkLog(workLogFile);
             WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
             AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(List.of(new EdgeDomain(targetDomainName))))
        {
            // NOTE(review): 1000 is presumably the crawl depth / document budget -- confirm against CrawlSpecRecord
            CrawlSpecRecord singleDomainSpec = new CrawlSpecRecord(targetDomainName, 1000, null);

            new CrawlTask(singleDomainSpec, anchorTagsSource, outputDir, warcArchiver, workLog).run();
        }
    }
    catch (Exception ex) {
        logger.warn("Exception in crawler", ex);
    }
    finally {
        heartbeat.shutDown();
    }
}
class CrawlTask implements SimpleBlockingThreadPool.Task {
private final CrawlSpecRecord specification;
@ -216,7 +242,8 @@ public class CrawlerMain extends ProcessMainClass {
AnchorTagsSource anchorTagsSource,
Path outputDir,
WarcArchiverIf warcArchiver,
WorkLog workLog) {
WorkLog workLog)
{
this.specification = specification;
this.anchorTagsSource = anchorTagsSource;
this.outputDir = outputDir;
@ -303,11 +330,19 @@ public class CrawlerMain extends ProcessMainClass {
private final MqMessage message;
private final MqSingleShotInbox inbox;
CrawlRequest(CrawlSpecProvider specProvider, Path outputDir, MqMessage message, MqSingleShotInbox inbox) {
private final String targetDomainName;
CrawlRequest(CrawlSpecProvider specProvider,
String targetDomainName,
Path outputDir,
MqMessage message,
MqSingleShotInbox inbox)
{
this.message = message;
this.inbox = inbox;
this.specProvider = specProvider;
this.outputDir = outputDir;
this.targetDomainName = targetDomainName;
}
@ -325,6 +360,7 @@ public class CrawlerMain extends ProcessMainClass {
var inbox = messageQueueFactory.createSingleShotInbox(CRAWLER_INBOX, node, UUID.randomUUID());
logger.info("Waiting for instructions");
var msgOpt = getMessage(inbox, nu.marginalia.mqapi.crawling.CrawlRequest.class.getSimpleName());
var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received"));
@ -350,6 +386,7 @@ public class CrawlerMain extends ProcessMainClass {
return new CrawlRequest(
specProvider,
request.targetDomainName,
crawlData.asPath(),
msg,
inbox);

View File

@ -183,6 +183,8 @@ public class HttpFetcherImpl implements HttpFetcher {
getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.addHeader("Accept-Language", "en,*;q=0.5")
.addHeader("Accept", "text/html, application/xhtml+xml, */*;q=0.8")
.addHeader("User-agent", userAgentString);
contentTags.paint(getBuilder);
@ -225,6 +227,7 @@ public class HttpFetcherImpl implements HttpFetcher {
getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.addHeader("Accept", "text/*, */*;q=0.9")
.addHeader("User-agent", userAgentString);
HttpFetchResult result = recorder.fetch(client, getBuilder.build());

View File

@ -32,7 +32,7 @@ public class NoSecuritySSL {
@SneakyThrows
public static SSLSocketFactory buildSocketFactory() {
// Install the all-trusting trust manager
final SSLContext sslContext = SSLContext.getInstance("SSL");
final SSLContext sslContext = SSLContext.getInstance("TLS");
sslContext.init(null, trustAllCerts, new java.security.SecureRandom());
var clientSessionContext = sslContext.getClientSessionContext();

View File

@ -8,7 +8,7 @@ import java.security.NoSuchAlgorithmException;
class WarcDigestBuilder {
private final MessageDigest digest;
private static final String digestAlgorithm = "SHA-1";
private static final String digestAlgorithm = "SHA-256";
public WarcDigestBuilder() throws NoSuchAlgorithmException {
this.digest = MessageDigest.getInstance(digestAlgorithm);

View File

@ -1,6 +1,6 @@
// This sets the data-has-js attribute on the body tag to true, so we can style the page with the assumption that
// This sets the data-has-js attribute on the html tag to true, so we can style the page with the assumption that
// the browser supports JS. This is a progressive enhancement, so the page will still work without JS.
document.getElementsByTagName('body')[0].setAttribute('data-has-js', 'true');
document.documentElement.setAttribute('data-has-js', 'true');
// To prevent the filter menu from being opened when the user hits enter on the search box, we need to add a keydown
// handler to the search box that stops the event from propagating. Janky hack, but it works.

View File

@ -1,33 +1,102 @@
$nicotine-dark: #acae89;
$nicotine-light: #f8f8ee;
$fg-dark: #000;
$fg-light: #fff;
$highlight-dark: #2f4858;
$highlight-light: #3F5F6F;
$highlight-light2: #eee;
$border-color: #ccc;
$border-color2: #aaa;
$heading-fonts: serif;
$visited: #fcc;
:root {
color-scheme: light;
--clr-bg-page: hsl(60, 42%, 95%); // $nicotine-light
--clr-bg-ui: hsl(0, 0%, 100%);
--clr-text-ui: #000; // $fg-dark
--clr-bg-theme: hsl(200, 28%, 34%); // $highlight-light
--clr-text-theme: #fff; // $fg-light
--clr-bg-highlight: hsl(0, 0%, 93%); // $highlight-light2
--clr-text-highlight: #111111;
--clr-bg-accent: hsl(63, 19%, 61%); // $nicotine-dark
--clr-border-accent: hsl(63, 19%, 35%);
--clr-border: #aaa; // $border-color2
--clr-shadow: var(--clr-border);
--clr-link: #0066cc;
--clr-link-visited: #531a89;
--clr-heading-link-visited: #fcc; // $visited
--font-family: sans-serif;
--font-size: 14px;
--font-family-heading: serif; // $heading-fonts
}
@mixin dark-theme-mixin {
color-scheme: dark;
--clr-bg-page: hsl(0, 0%, 6%);
--clr-bg-ui: hsl(0, 0%, 18%);
--clr-text-ui: #ddd;
--clr-bg-theme: hsl(0, 0%, 2%);
--clr-text-theme: var(--clr-text-ui);
--clr-bg-highlight: hsl(0, 0%, 11%);
--clr-text-highlight: #fff;
--clr-bg-accent: hsl(200, 32%, 28%);
--clr-border-accent: hsl(200, 8%, 12%);
--clr-border: hsl(0, 0%, 30%);
--clr-shadow: #000;
--clr-link: #8a8aff;
--clr-link-visited: #ffadff;
--clr-heading-link-visited: var(--clr-link-visited);
}
:root[data-theme='dark'] {
@include dark-theme-mixin;
}
// Makes theme match the user's OS preference when JS is disabled
@media (prefers-color-scheme: dark) {
:root:not([data-has-js="true"]) {
@include dark-theme-mixin;
}
}
* {
box-sizing: border-box;
}
a {
color: var(--clr-link);
}
a:visited {
color: var(--clr-link-visited);
}
input, textarea, select {
color: inherit;
}
h1 a, h2 a {
color: $fg-light;
color: var(--clr-text-theme);
}
h1 a:visited, h2 a:visited {
color: $visited;
color: var(--clr-heading-link-visited);
}
progress {
width: 10ch;
}
body {
background-color: $nicotine-light;
color: $fg-dark;
font-family: sans-serif;
font-size: 14px;
background-color: var(--clr-bg-page);
color: var(--clr-text-ui);
font-family: var(--font-family);
font-size: var(--font-size);
line-height: 1.6;
margin-left: auto;
margin-right: auto;
@ -99,28 +168,28 @@ body {
li {
display: inline;
padding: 1ch;
background-color: $highlight-light2;
background-color: var(--clr-bg-highlight);
a {
text-decoration: none;
display: inline-block;
color: #000;
color: var(--clr-text-highlight);
}
}
li.current {
background-color: $highlight-light;
background-color: var(--clr-bg-theme);
a {
color: #fff;
color: var(--clr-text-theme);
}
}
}
}
.dialog {
border: 1px solid $border-color2;
box-shadow: 0 0 1ch $border-color;
background-color: #fff;
border: 1px solid var(--clr-border);
box-shadow: 0 0 1ch var(--clr-shadow);
background-color: var(--clr-bg-ui);
padding: 1ch;
h2 {
@ -129,43 +198,58 @@ body {
font-weight: normal;
padding: 0.5ch;
font-size: 12pt;
background-color: $highlight-light;
color: #fff;
background-color: var(--clr-bg-theme);
color: var(--clr-text-theme);
}
}
header {
background-color: $nicotine-dark;
color: #fff;
border: 1px solid #888;
box-shadow: 0 0 0.5ch #888;
background-color: var(--clr-bg-accent);
border: 1px solid var(--clr-border-accent);
color: var(--clr-text-ui);
box-shadow: 0 0 0.5ch var(--clr-shadow);
margin-bottom: 1ch;
display: flex;
align-items: center;
justify-content: space-between;
nav {
a {
text-decoration: none;
color: #000;
color: var(--clr-text-ui);
padding: .5ch;
display: inline-block;
}
a:visited {
color: var(--clr-text-ui);
}
a.extra {
background: #ccc linear-gradient(45deg,
rgba(255,100,100,1) 0%,
rgba(100,255,100,1) 50%,
rgba(100,100,255,1) 100%);
hsl(0, 100%, 70%) 0%,
hsl(120, 100%, 70%) 50%,
hsl(240, 100%, 70%) 100%);
color: black;
text-shadow: 0 0 0.5ch #fff;
}
a:hover, a:focus {
background: #2f4858;
color: #fff !important;
background: var(--clr-bg-theme);
color: var(--clr-text-theme);
}
}
}
#theme {
padding: .5ch;
display: none;
[data-has-js='true'] & {
display: block;
}
}
#complaint {
@extend .dialog;
max-width: 60ch;
@ -210,11 +294,11 @@ header {
@extend .heading;
}
background-color: #fff;
background-color: var(--clr-bg-ui);
padding: 1ch;
margin: 1ch;
border: 1px solid $border-color2;
box-shadow: 0 0 1ch $border-color;
border: 1px solid var(--clr-border);
box-shadow: 0 0 1ch var(--clr-shadow);
}
section.cards {
@ -226,11 +310,10 @@ section.cards {
justify-content: flex-start;
.card {
border: 2px #ccc;
background-color: #fff;
background-color: var(--clr-bg-ui);
border-left: 1px solid #ecb;
border-top: 1px solid #ecb;
box-shadow: #0008 0 0 5px;
box-shadow: var(--clr-shadow) 0 0 5px;
h2 {
@extend .heading;
@ -239,7 +322,7 @@ section.cards {
h2 a {
display: block !important;
color: #fff;
color: inherit;
text-decoration: none;
}
a:focus img {
@ -271,12 +354,17 @@ section.cards {
padding-right: 1ch;
line-height: 1.6;
}
[data-theme='dark'] & {
border: 1px solid var(--clr-border);
}
}
}
.positions {
box-shadow: 0 0 2px #888;
background-color: #e4e4e4;
box-shadow: 0 0 2px var(--clr-shadow);
backdrop-filter: brightness(90%);
color: var(--clr-text-highlight);
padding: 2px;
margin-right: -1ch;
margin-left: 1ch;
@ -297,13 +385,13 @@ footer {
h1 {
font-weight: normal;
border-bottom: 4px solid $highlight-light;
border-bottom: 4px solid var(--clr-bg-theme);
}
h2 {
font-size: 14pt;
font-weight: normal;
border-bottom: 2px solid $highlight-dark;
border-bottom: 2px solid var(--clr-bg-theme);
width: 80%;
}
@ -312,9 +400,9 @@ footer {
flex-basis: 40ch;
flex-grow: 1.1;
background-color: #fff;
border-left: 1px solid $border-color2;
box-shadow: -1px -1px 5px $border-color;
background-color: var(--clr-bg-ui);
border-left: 1px solid var(--clr-border);
box-shadow: -1px -1px 5px var(--clr-shadow);
padding-left: 1ch;
padding-right: 1ch;
@ -329,18 +417,18 @@ footer {
}
.shadowbox {
box-shadow: 0 0 1ch $border-color2;
border: 1px solid $border-color;
box-shadow: 0 0 1ch var(--clr-shadow);
border: 1px solid var(--clr-border);
}
.heading {
margin: 0;
padding: 0.5ch;
background-color: $highlight-light;
border-bottom: 1px solid $border-color2;
font-family: $heading-fonts;
background-color: var(--clr-bg-theme);
border-bottom: 1px solid var(--clr-border);
font-family: var(--font-family-heading);
font-weight: normal;
color: $fg-light;
color: var(--clr-text-theme);
font-size: 12pt;
word-break: break-word;
}
@ -440,7 +528,7 @@ footer {
@extend .shadowbox;
padding: 0.5ch;
background-color: $fg-light;
background-color: var(--clr-bg-ui);
display: grid;
grid-template-columns: max-content 0 auto max-content;
grid-gap: 0.5ch;
@ -452,12 +540,13 @@ footer {
padding: 0.5ch;
font-size: 14pt;
word-break: keep-all;
background-color: $highlight-light;
color: $fg-light;
font-family: $heading-fonts;
background-color: var(--clr-bg-theme);
color: var(--clr-text-theme);
font-family: var(--font-family-heading);
font-weight: normal;
border: 1px solid;
text-align: center;
display: flex;
justify-content: space-between;
}
#suggestions-anchor {
@ -469,18 +558,18 @@ footer {
font-family: monospace;
font-size: 12pt;
padding: 0.5ch;
border: 1px solid $border-color2;
background-color: $fg-light;
color: $fg-dark;
border: 1px solid var(--clr-border);
background-color: inherit;
}
input[type="submit"] {
font-size: 12pt;
border: 1px solid $border-color2;
background-color: $fg-light;
color: $fg-dark;
border: 1px solid var(--clr-border);
background-color: var(--clr-bg-ui);
cursor: pointer;
}
// white suggestions look fine in dark mode
.suggestions {
background-color: #fff;
padding: .5ch;
@ -491,7 +580,7 @@ footer {
width: 300px;
border-left: 1px solid #ccc;
border-top: 1px solid #ccc;
box-shadow: 5px 5px 5px #888;
box-shadow: 5px 5px 5px var(--clr-shadow);
z-index: 10;
a {
@ -528,22 +617,22 @@ footer {
#filters {
@extend .shadowbox;
margin-top: 1ch;
background-color: $fg-light;
background-color: var(--clr-bg-ui);
h2 {
@extend .heading;
background-color: $highlight-light;
background-color: var(--clr-bg-theme);
}
h3 {
@extend .heading;
background-color: $highlight-light2;
background-color: var(--clr-bg-highlight);
color: var(--clr-text-highlight);
font-family: sans-serif;
color: #000;
border-bottom: 1px solid #000;
}
hr {
border-top: 0.5px solid $border-color2;
border-top: 0.5px solid var(--clr-border);
border-bottom: none;
}
ul {
@ -553,17 +642,17 @@ footer {
li {
padding: 1ch;
a {
color: $fg-dark;
color: inherit;
text-decoration: none;
}
a:hover, a:focus {
border-bottom: 1px solid $highlight-light;
border-bottom: 1px solid var(--clr-bg-theme);
}
}
li.current {
border-left: 4px solid $highlight-light;
background-color: $highlight-light2;
border-left: 4px solid var(--clr-bg-theme);
background-color: var(--clr-bg-highlight);
a {
margin-left: -4px;
}
@ -576,46 +665,46 @@ footer {
margin: 1ch 0 2ch 0;
.url {
background-color: $highlight-light;
background-color: var(--clr-bg-theme);
padding-left: 0.5ch;
a {
word-break: break-all;
font-family: monospace;
font-size: 8pt;
color: $fg-light;
color: var(--clr-text-theme);
text-shadow: 0 0 1ch #000; // guarantee decent contrast across background colors
}
a:visited {
color: $visited;
color: var(--clr-heading-link-visited);
}
}
h2 {
a {
word-break: break-all;
color: $fg-dark;
color: var(--clr-text-ui);
text-decoration: none;
}
font-size: 12pt;
@extend .heading;
background-color: $highlight-light2;
background-color:var(--clr-bg-highlight);
}
.description {
background-color: $fg-light;
background-color: var(--clr-bg-ui);
word-break: break-word;
padding: 1ch;
margin: 0;
}
ul.additional-results {
background-color: $fg-light;
background-color: var(--clr-bg-ui);
padding: 1ch;
list-style: none;
margin: 0;
a {
color: $fg-dark;
color: inherit;
}
}
}
@ -631,7 +720,7 @@ footer {
display: flex;
font-size: 10pt;
padding: 1ch;
background-color: #eee;
background-color: var(--clr-bg-highlight);
> * {
margin-right: 1ch;
@ -645,12 +734,12 @@ footer {
padding-left: 4px;
}
a {
color: #000;
color: var(--clr-text-highlight);
}
}
@media (max-device-width: 624px) {
body[data-has-js="true"] { // This property is set via js so we can selectively enable these changes only if JS is enabled;
[data-has-js="true"] body { // This property is set via js so we can selectively enable these changes only if JS is enabled;
// This is desirable since mobile navigation is JS-driven. If JS is disabled, having a squished
// GUI is better than having no working UI.
margin: 0 !important;
@ -666,6 +755,8 @@ footer {
#mcfeast {
display: inline;
float: right;
width: 2rem;
font-size: 1rem;
}
#menu-close {

View File

@ -0,0 +1,57 @@
/**
 * Resolve which color theme should be applied.
 *
 * An explicit 'dark' or 'light' value stored under the 'theme' key in
 * localStorage wins; otherwise the OS-level preference is used, defaulting
 * to 'light'.
 *
 * @returns {{value: string, system: boolean}} value is 'dark' or 'light';
 *          system is true when the value was derived from the OS preference
 *          rather than from a stored choice
 */
function getTheme() {
  let stored = null;
  try {
    stored = window.localStorage.getItem('theme');
  } catch (e) {
    // Accessing localStorage can throw (e.g. a SecurityError when storage is
    // blocked by the browser's privacy settings); fall back to the OS theme.
  }

  // if a valid theme is set in localStorage, return it
  if (stored === 'dark' || stored === 'light') {
    return { value: stored, system: false };
  }

  // if matchMedia is supported and OS theme is dark
  if (typeof window.matchMedia === 'function'
      && window.matchMedia('(prefers-color-scheme: dark)').matches) {
    return { value: 'dark', system: true };
  }

  return { value: 'light', system: true };
}
/**
 * Persist a theme choice and apply it to the document.
 *
 * 'dark' and 'light' are stored under the 'theme' key in localStorage; any
 * other value (e.g. 'system') clears the stored choice so the theme is
 * resolved again through getTheme().
 *
 * @param {string} value 'dark', 'light', or anything else for "follow the system"
 */
function setTheme(value) {
  const isExplicit = value === 'dark' || value === 'light';

  try {
    if (isExplicit) {
      window.localStorage.setItem('theme', value);
    } else {
      window.localStorage.removeItem('theme');
    }
  } catch (e) {
    // Storage may be unavailable (blocked or full). The choice then cannot be
    // remembered, but it should still take effect for the current page.
  }

  // An explicit choice is applied directly, so it works even when it could
  // not be persisted; otherwise re-resolve from storage / OS preference.
  const applied = isExplicit ? value : getTheme().value;
  document.documentElement.setAttribute('data-theme', applied);
}
/**
 * Apply the current theme to the document and wire up the theme selector.
 *
 * The theme attribute is set first so that it takes effect even when the
 * '#theme-select' element is absent from the page.
 */
function initializeTheme() {
  const theme = getTheme();
  document.documentElement.setAttribute('data-theme', theme.value);

  const themeSelect = document.getElementById('theme-select');
  if (!themeSelect) {
    // No selector on this page; the theme itself has already been applied.
    return;
  }

  // system is selected by default in the themeSwitcher so ignore it here
  if (!theme.system) {
    themeSelect.value = theme.value;
  }

  themeSelect.addEventListener('change', e => {
    setTheme(e.target.value);
  });

  if (typeof window.matchMedia !== 'function') {
    return;
  }
  const mql = window.matchMedia('(prefers-color-scheme: dark)');

  // if someone changes their theme at the OS level we need to update
  // their theme immediately if they're using their OS theme
  const onOsThemeChange = () => {
    if (themeSelect.value !== 'system') return;

    // Re-resolve via 'system' instead of passing 'dark'/'light': an explicit
    // value would be written to localStorage and pin the theme, even though
    // the selector still says "System".
    setTheme('system');
  };

  if (typeof mql.addEventListener === 'function') {
    mql.addEventListener('change', onOsThemeChange);
  } else if (typeof mql.addListener === 'function') {
    // Older browsers only expose the deprecated addListener API
    mql.addListener(onOsThemeChange);
  }
}
initializeTheme();

View File

@ -27,7 +27,7 @@ function setupTypeahead() {
for (i=0;i<items.length;i++) {
item = document.createElement('a');
item.innerHTML=items[i];
item.textContent=items[i];
item.setAttribute('href', '#')
function suggestionClickHandler(e) {

View File

@ -7,4 +7,15 @@
<a href="https://memex.marginalia.nu/projects/edge/supporting.gmi">Donate</a>
<a class="extra" href="https://search.marginalia.nu/explore/random">Random</a>
</nav>
<div id="theme">
<label for="theme-select" class="screenreader-only">Color Theme</label>
<select id="theme-select">
<option value="system" selected>System</option>
<option value="light">Light</option>
<option value="dark">Dark</option>
</select>
</div>
</header>
<!-- load this ASAP to avoid color theme flicker -->
<script src="/theme.js"></script>

View File

@ -86,7 +86,7 @@ public class ControlCrawlDataService {
ORDER BY httpStatus
""");
while (rs.next()) {
final boolean isCurrentFilter = selectedContentType.equals(rs.getString("httpStatus"));
final boolean isCurrentFilter = selectedHttpStatus.equals(rs.getString("httpStatus"));
final int status = rs.getInt("httpStatus");
final int cnt = rs.getInt("cnt");

View File

@ -24,6 +24,7 @@ import java.nio.file.Path;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.Set;
@Singleton
@ -88,6 +89,9 @@ public class ControlNodeActionsService {
Spark.post("/nodes/:id/actions/recrawl", this::triggerAutoRecrawl,
redirectControl.renderRedirectAcknowledgement("Recrawling", "..")
);
Spark.post("/nodes/:id/actions/recrawl-single-domain", this::triggerSingleDomainRecrawl,
redirectControl.renderRedirectAcknowledgement("Recrawling", "..")
);
Spark.post("/nodes/:id/actions/process", this::triggerProcess,
redirectControl.renderRedirectAcknowledgement("Processing", "..")
);
@ -216,6 +220,21 @@ public class ControlNodeActionsService {
return "";
}
/** Handler for the recrawl-single-domain node action: forwards the node id,
 * the source storage and the target domain name to the crawl client.
 *
 * @param request  carries the ":id" path parameter and the "source" and
 *                 "targetDomainName" query/form parameters
 * @param response unused
 * @return an empty string
 * @throws NullPointerException if "targetDomainName" is absent
 */
private Object triggerSingleDomainRecrawl(Request request, Response response) throws SQLException {
    int nodeId = Integer.parseInt(request.params("id"));

    var toCrawl = parseSourceFileStorageId(request.queryParams("source"));

    // Name the missing parameter so a malformed request is diagnosable from the stack trace
    var targetDomainName = Objects.requireNonNull(
            request.queryParams("targetDomainName"),
            "Missing required parameter 'targetDomainName'");

    crawlClient.triggerRecrawlSingleDomain(nodeId, toCrawl, targetDomainName);

    return "";
}
private Object triggerNewCrawl(Request request, Response response) throws SQLException {
int nodeId = Integer.parseInt(request.params("id"));

View File

@ -24,12 +24,20 @@
<h2>Summary</h2>
<table class="table">
<tr>
<th>Domain</th><th>File</th>
<th>Domain</th><th>File</th><th>Crawl</th>
</tr>
<td>{{domain}}</td>
<td>
<a class="btn btn-primary" href="/nodes/{{node.id}}/storage/{{storage.id}}/transfer?path={{{path}}}">Download Parquet</a>
</td>
<td>
<form method="post" action="/nodes/{{node.id}}/actions/recrawl-single-domain" onsubmit="return confirm('Confirm recrawl of {{domain}}')">
<input type="hidden" name="source" value="{{storage.id}}">
<input type="hidden" name="targetDomainName" value="{{domain}}">
<button type="submit" class="btn btn-primary">Trigger Recrawl</button>
</form>
</td>
</table>
<h2>Contents</h2>

View File

@ -41,7 +41,7 @@ echo
echo "1) barebones instance (1 node)"
echo "2) barebones instance (2 nodes)"
echo "3) full Marginalia Search instance?"
echo "4) non-docker install? (not recommended)"
echo "4) non-docker install? (proof-of-concept, not recommended)"
echo
read -p "Enter 1, 2, 3, or 4: " INSTANCE_TYPE
@ -149,17 +149,24 @@ elif [ "${INSTANCE_TYPE}" == "4" ]; then
envsubst < install/docker-compose-scaffold.yml.template >${INSTALL_DIR}/docker-compose.yml
cat <<EOF > ${INSTALL_DIR}/README
Quick note about running Marginalia Search in a non-docker environment:
Quick note about running Marginalia Search in a non-docker environment.
* The template sets up a sample (in-docker) setup for
mariadb and zookeeper. These can also be run outside
of docker, but you will need to update the db.properties
file and "zookeeper-hosts" in the system.properties
file to point to the correct locations/addresses.
* Each service is spawned by the same launcher. When building
the project with "gradlew assemble", the launcher is put in
"code/services-core/single-service-runner/build/distributions/marginalia.tar".
This needs to be extracted.
Beware that this installation mode is more of a proof-of-concept and demonstration that the
system is not unhealthily dependent on docker, than a production-ready setup, and is not
recommended for production use! The container setup is much more robust and easier to manage.
Note: This script only sets up an install directory, and does not build the system.
You will need to build the system with "gradlew assemble" before you can run it.
Each service is spawned by the same launcher. After building the project with
"gradlew assemble", the launcher is put in "code/services-core/single-service-runner/build/distributions/marginalia.tar".
This needs to be extracted!
Note: The template sets up a sample (in-docker) setup for mariadb and zookeeper. These can also be run outside
of docker, but you will need to update the db.properties file and "zookeeper-hosts" in the system.properties
file to point to the correct locations/addresses.
Running:
To launch a process you need to unpack it, and then run the launcher with the
appropriate arguments. For example:
@ -177,13 +184,16 @@ A working setup needs at all the services
* index [ http port is internal ]
* executor [ http port is internal ]
The index and executor services should be on the same partition e.g. index:1 and executor:1,
which should be a number larger than 0. You can have multiple pairs of index and executor partitions,
but the pair should run on the same physical machine with the same install directory.
Since you will need to manage ports yourself, you must assign distinct ports-pairs to each service.
The query service can use any partition number.
* An index and executor services should exist on the same partition e.g. index:1 and executor:1. The partition
number is the last digit of the service name, and should be positive. You can have multiple pairs of index
and executor partitions, but the pair should run on the same physical machine with the same install directory.
* The query service can use any partition number.
* The control service should be on partition 1.
The control service should be on partition 1.
EOF
echo

View File

@ -3,11 +3,11 @@
This directory is a staging area for running the system. It contains scripts
and templates for installing the system on a server, and for running it locally.
See [https://docs.marginalia.nu/](https://docs.marginalia.nu/) for additional
documentation.
## Requirements
**x86-64 Linux** - The system is only tested on x86-64 Linux. It may work on other
platforms, but for lack of suitable hardware, this can not be guaranteed.
**Docker** - It is a bit of a pain to install, but if you follow
[this guide](https://docs.docker.com/engine/install/ubuntu/#install-using-the-repository) you're on the right track for ubuntu-like systems.
@ -15,7 +15,12 @@ documentation.
The civilized way of installing this is to use [SDKMAN](https://sdkman.io/);
graalce is a good distribution choice but it doesn't matter too much.
## Set up
## Quick Set up
[https://docs.marginalia.nu/](https://docs.marginalia.nu/) has a more comprehensive guide for the install
and operation of the search engine. This is a quick guide for the impatient.
---
To go from a clean check out of the git repo to a running search engine,
follow these steps.
@ -51,6 +56,8 @@ you for which installation mode you want to use. The options are:
2. Full Marginalia Search instance - This will install an instance of the search engine
configured like [search.marginalia.nu](https://search.marginalia.nu). This is useful
for local development and testing.
3. Non-docker installation - This will install the system outside of docker.
This is still an experimental run-mode.
It will also prompt you for account details for a new mariadb instance, which will be
created for you. The database will be initialized with the schema and data required

View File

@ -210,8 +210,8 @@ dependencyResolutionManagement {
library('sqlite','org.xerial','sqlite-jdbc').version('3.41.2.2')
library('javax.annotation','javax.annotation','javax.annotation-api').version('1.3.2')
library('parquet-column', 'org.apache.parquet','parquet-column').version('1.13.1')
library('parquet-hadoop', 'org.apache.parquet','parquet-hadoop').version('1.13.1')
library('parquet-column', 'org.apache.parquet','parquet-column').version('1.14.0')
library('parquet-hadoop', 'org.apache.parquet','parquet-hadoop').version('1.14.0')
library('curator-framework', 'org.apache.curator','curator-framework').version('5.6.0')
library('curator-x-discovery', 'org.apache.curator','curator-x-discovery').version('5.6.0')

View File

@ -9,7 +9,7 @@ java {
}
dependencies {
implementation ('org.apache.parquet:parquet-column:1.13.1') {
implementation ('org.apache.parquet:parquet-column:1.14.0') {
transitive = true
}
implementation('org.apache.parquet:parquet-hadoop:1.13.1') {

View File

@ -1,6 +1,7 @@
package org.apache.hadoop.conf;
public class Configuration {
public Configuration(boolean x) {}
public boolean getBoolean(String x, boolean y) {
return y;