mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(*) Overhaul settings and properties
Use a system.properties file to configure the system. This is loaded statically by MainClass or ProcessMainClass. Update the property names to be more consistent, and update the documentation to reflect the changes.
This commit is contained in:
parent
176b9c9666
commit
7c6e18f7a7
@ -64,7 +64,7 @@ public class IndexClient extends AbstractDynamicClient {
|
|||||||
.postGet(ctx, node, "/search/", specs, SearchResultSet.class).onErrorReturn(t -> new SearchResultSet())
|
.postGet(ctx, node, "/search/", specs, SearchResultSet.class).onErrorReturn(t -> new SearchResultSet())
|
||||||
.observeOn(Schedulers.io());
|
.observeOn(Schedulers.io());
|
||||||
} catch (RouteNotConfiguredException ex) {
|
} catch (RouteNotConfiguredException ex) {
|
||||||
return Observable.error(ex);
|
return Observable.empty();
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.reduce(SearchResultSet::combine)
|
.reduce(SearchResultSet::combine)
|
||||||
|
@ -1,3 +1,3 @@
|
|||||||
package nu.marginalia;
|
package nu.marginalia;
|
||||||
|
|
||||||
public record UserAgent(String uaString) {}
|
public record UserAgent(String uaString, String uaIdentifier) {}
|
||||||
|
@ -12,19 +12,19 @@ import java.util.Optional;
|
|||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
public class WmsaHome {
|
public class WmsaHome {
|
||||||
public static UserAgent getUserAgent() throws IOException {
|
public static UserAgent getUserAgent() {
|
||||||
var uaPath = getHomePath().resolve("conf/user-agent");
|
|
||||||
|
|
||||||
if (!Files.exists(uaPath)) {
|
return new UserAgent(
|
||||||
throw new FileNotFoundException("Could not find " + uaPath);
|
System.getProperty("crawler.userAgentString", "Mozilla/5.0 (compatible; Marginalia-like bot; +https://git.marginalia.nu/))"),
|
||||||
}
|
System.getProperty("crawler.userAgentIdentifier", "search.marginalia.nu")
|
||||||
|
);
|
||||||
return new UserAgent(Files.readString(uaPath).trim());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Path getUploadDir() {
|
public static Path getUploadDir() {
|
||||||
return Path.of("/uploads");
|
return Path.of(
|
||||||
|
System.getProperty("executor.uploadDir", "/uploads")
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Path getHomePath() {
|
public static Path getHomePath() {
|
||||||
@ -93,11 +93,6 @@ public class WmsaHome {
|
|||||||
public static Path getAtagsPath() {
|
public static Path getAtagsPath() {
|
||||||
return getHomePath().resolve("data/atags.parquet");
|
return getHomePath().resolve("data/atags.parquet");
|
||||||
}
|
}
|
||||||
private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");
|
|
||||||
|
|
||||||
public static boolean isDebug() {
|
|
||||||
return debugMode;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -16,7 +16,7 @@ public class DomainBlacklistImpl implements DomainBlacklist {
|
|||||||
private volatile TIntHashSet spamDomainSet = new TIntHashSet();
|
private volatile TIntHashSet spamDomainSet = new TIntHashSet();
|
||||||
private final HikariDataSource dataSource;
|
private final HikariDataSource dataSource;
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final boolean blacklistDisabled = Boolean.getBoolean("no-domain-blacklist");
|
private final boolean blacklistDisabled = Boolean.getBoolean("blacklist.disable");
|
||||||
@Inject
|
@Inject
|
||||||
public DomainBlacklistImpl(HikariDataSource dataSource) {
|
public DomainBlacklistImpl(HikariDataSource dataSource) {
|
||||||
this.dataSource = dataSource;
|
this.dataSource = dataSource;
|
||||||
|
@ -0,0 +1,33 @@
|
|||||||
|
package nu.marginalia.service;
|
||||||
|
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
public class ConfigLoader {
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(ConfigLoader.class);
|
||||||
|
|
||||||
|
static Path getConfigPath(String configName) {
|
||||||
|
return WmsaHome.getHomePath().resolve("conf/properties/" + configName + ".properties");
|
||||||
|
}
|
||||||
|
|
||||||
|
static void loadConfig(Path configPath) {
|
||||||
|
if (!Files.exists(configPath)) {
|
||||||
|
logger.info("No config file found at {}", configPath);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info("Loading config from {}", configPath);
|
||||||
|
|
||||||
|
try (var is = Files.newInputStream(configPath)) {
|
||||||
|
logger.info("Config:\n{}", Files.readString(configPath));
|
||||||
|
System.getProperties().load(is);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -15,7 +15,14 @@ import java.net.UnknownHostException;
|
|||||||
* They must also invoke init() in their main method.
|
* They must also invoke init() in their main method.
|
||||||
*/
|
*/
|
||||||
public abstract class MainClass {
|
public abstract class MainClass {
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private static final Logger logger = LoggerFactory.getLogger(MainClass.class);
|
||||||
|
|
||||||
|
static {
|
||||||
|
// Load global config ASAP
|
||||||
|
ConfigLoader.loadConfig(
|
||||||
|
ConfigLoader.getConfigPath("system")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
public MainClass() {
|
public MainClass() {
|
||||||
RxJavaPlugins.setErrorHandler(this::handleError);
|
RxJavaPlugins.setErrorHandler(this::handleError);
|
||||||
@ -42,11 +49,14 @@ public abstract class MainClass {
|
|||||||
|
|
||||||
|
|
||||||
protected static void init(ServiceId id, String... args) {
|
protected static void init(ServiceId id, String... args) {
|
||||||
|
|
||||||
System.setProperty("log4j2.isThreadContextMapInheritable", "true");
|
System.setProperty("log4j2.isThreadContextMapInheritable", "true");
|
||||||
System.setProperty("isThreadContextMapInheritable", "true");
|
System.setProperty("isThreadContextMapInheritable", "true");
|
||||||
System.setProperty("service-name", id.name);
|
System.setProperty("service-name", id.name);
|
||||||
|
|
||||||
|
ConfigLoader.loadConfig(
|
||||||
|
ConfigLoader.getConfigPath(id.name)
|
||||||
|
);
|
||||||
|
|
||||||
initJdbc();
|
initJdbc();
|
||||||
initPrometheus();
|
initPrometheus();
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,20 @@
|
|||||||
|
package nu.marginalia.service;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
public abstract class ProcessMainClass {
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(ProcessMainClass.class);
|
||||||
|
|
||||||
|
static {
|
||||||
|
// Load global config ASAP
|
||||||
|
ConfigLoader.loadConfig(
|
||||||
|
ConfigLoader.getConfigPath("system")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ProcessMainClass() {
|
||||||
|
new org.mariadb.jdbc.Driver();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -35,7 +35,7 @@ public class DatabaseModule extends AbstractModule {
|
|||||||
dbProperties = loadDbProperties();
|
dbProperties = loadDbProperties();
|
||||||
|
|
||||||
if (migrate) {
|
if (migrate) {
|
||||||
if (Boolean.getBoolean("disableFlyway")) {
|
if (Boolean.getBoolean("flyway.disable")) {
|
||||||
logger.info("Flyway disabled");
|
logger.info("Flyway disabled");
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
@ -22,7 +22,7 @@ public class IpBlockList {
|
|||||||
private final GeoIpBlocklist geoIpBlocklist;
|
private final GeoIpBlocklist geoIpBlocklist;
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final List<SubnetUtils.SubnetInfo> badSubnets = new ArrayList<>();
|
private final List<SubnetUtils.SubnetInfo> badSubnets = new ArrayList<>();
|
||||||
private final boolean blocklistDisabled = Boolean.getBoolean("no-ip-blocklist");
|
private final boolean blocklistDisabled = Boolean.getBoolean("ip-blocklist.disabled");
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public IpBlockList(GeoIpBlocklist geoIpBlocklist) {
|
public IpBlockList(GeoIpBlocklist geoIpBlocklist) {
|
||||||
|
@ -6,24 +6,14 @@ import nu.marginalia.array.algo.IntArraySearch;
|
|||||||
import nu.marginalia.array.algo.IntArraySort;
|
import nu.marginalia.array.algo.IntArraySort;
|
||||||
import nu.marginalia.array.algo.IntArrayTransformations;
|
import nu.marginalia.array.algo.IntArrayTransformations;
|
||||||
import nu.marginalia.array.delegate.ShiftedIntArray;
|
import nu.marginalia.array.delegate.ShiftedIntArray;
|
||||||
import nu.marginalia.array.delegate.ShiftedLongArray;
|
|
||||||
import nu.marginalia.array.page.SegmentIntArray;
|
import nu.marginalia.array.page.SegmentIntArray;
|
||||||
import nu.marginalia.array.page.SegmentLongArray;
|
|
||||||
import nu.marginalia.array.scheme.ArrayPartitioningScheme;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.lang.foreign.Arena;
|
import java.lang.foreign.Arena;
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
|
|
||||||
public interface IntArray extends IntArrayBase, IntArrayTransformations, IntArraySearch, IntArraySort {
|
public interface IntArray extends IntArrayBase, IntArrayTransformations, IntArraySearch, IntArraySort {
|
||||||
int WORD_SIZE = 4;
|
int WORD_SIZE = 4;
|
||||||
|
|
||||||
ArrayPartitioningScheme DEFAULT_PARTITIONING_SCHEME
|
|
||||||
= ArrayPartitioningScheme.forPartitionSize(Integer.getInteger("wmsa.page-size",1<<30) / WORD_SIZE);
|
|
||||||
|
|
||||||
int MAX_CONTINUOUS_SIZE = Integer.MAX_VALUE/WORD_SIZE - 16;
|
|
||||||
|
|
||||||
static IntArray allocate(long size) {
|
static IntArray allocate(long size) {
|
||||||
return SegmentIntArray.onHeap(Arena.ofShared(), size);
|
return SegmentIntArray.onHeap(Arena.ofShared(), size);
|
||||||
}
|
}
|
||||||
|
@ -11,6 +11,7 @@ import nu.marginalia.converting.sideload.SideloadSourceFactory;
|
|||||||
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
|
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
|
||||||
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||||
import nu.marginalia.converting.writer.ConverterWriter;
|
import nu.marginalia.converting.writer.ConverterWriter;
|
||||||
|
import nu.marginalia.service.ProcessMainClass;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.mq.MessageQueueFactory;
|
import nu.marginalia.mq.MessageQueueFactory;
|
||||||
import nu.marginalia.mq.MqMessage;
|
import nu.marginalia.mq.MqMessage;
|
||||||
@ -38,7 +39,7 @@ import java.util.concurrent.atomic.AtomicInteger;
|
|||||||
|
|
||||||
import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX;
|
import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX;
|
||||||
|
|
||||||
public class ConverterMain {
|
public class ConverterMain extends ProcessMainClass {
|
||||||
private static final Logger logger = LoggerFactory.getLogger(ConverterMain.class);
|
private static final Logger logger = LoggerFactory.getLogger(ConverterMain.class);
|
||||||
private final DomainProcessor processor;
|
private final DomainProcessor processor;
|
||||||
private final Gson gson;
|
private final Gson gson;
|
||||||
|
@ -32,6 +32,7 @@ import java.util.*;
|
|||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
public class DomainProcessor {
|
public class DomainProcessor {
|
||||||
|
private static final int SIDELOAD_THRESHOLD = Integer.getInteger("converter.sideloadThreshold", 10_000);
|
||||||
private final DocumentProcessor documentProcessor;
|
private final DocumentProcessor documentProcessor;
|
||||||
private final SiteWords siteWords;
|
private final SiteWords siteWords;
|
||||||
private final AnchorTagsSource anchorTagsSource;
|
private final AnchorTagsSource anchorTagsSource;
|
||||||
@ -59,7 +60,7 @@ public class DomainProcessor {
|
|||||||
public ConverterBatchWritableIf createWritable(SerializableCrawlDataStream domain) {
|
public ConverterBatchWritableIf createWritable(SerializableCrawlDataStream domain) {
|
||||||
final int sizeHint = domain.sizeHint();
|
final int sizeHint = domain.sizeHint();
|
||||||
|
|
||||||
if (sizeHint > 10_000) {
|
if (sizeHint > SIDELOAD_THRESHOLD) {
|
||||||
// If the file is too big, we run a processing mode that doesn't
|
// If the file is too big, we run a processing mode that doesn't
|
||||||
// require loading the entire dataset into RAM
|
// require loading the entire dataset into RAM
|
||||||
return sideloadProcessing(domain, sizeHint);
|
return sideloadProcessing(domain, sizeHint);
|
||||||
|
@ -23,6 +23,7 @@ import nu.marginalia.crawling.io.CrawledDomainReader;
|
|||||||
import nu.marginalia.crawling.io.CrawlerOutputFile;
|
import nu.marginalia.crawling.io.CrawlerOutputFile;
|
||||||
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
|
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
|
||||||
import nu.marginalia.crawlspec.CrawlSpecFileNames;
|
import nu.marginalia.crawlspec.CrawlSpecFileNames;
|
||||||
|
import nu.marginalia.service.ProcessMainClass;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
|
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
|
||||||
import nu.marginalia.mq.MessageQueueFactory;
|
import nu.marginalia.mq.MessageQueueFactory;
|
||||||
@ -51,7 +52,7 @@ import java.util.concurrent.atomic.AtomicInteger;
|
|||||||
|
|
||||||
import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;
|
import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;
|
||||||
|
|
||||||
public class CrawlerMain {
|
public class CrawlerMain extends ProcessMainClass {
|
||||||
private final static Logger logger = LoggerFactory.getLogger(CrawlerMain.class);
|
private final static Logger logger = LoggerFactory.getLogger(CrawlerMain.class);
|
||||||
|
|
||||||
private final UserAgent userAgent;
|
private final UserAgent userAgent;
|
||||||
@ -96,10 +97,10 @@ public class CrawlerMain {
|
|||||||
this.node = processConfiguration.node();
|
this.node = processConfiguration.node();
|
||||||
|
|
||||||
pool = new SimpleBlockingThreadPool("CrawlerPool",
|
pool = new SimpleBlockingThreadPool("CrawlerPool",
|
||||||
Integer.getInteger("crawler.pool-size", 256),
|
Integer.getInteger("crawler.poolSize", 256),
|
||||||
1);
|
1);
|
||||||
|
|
||||||
fetcher = new HttpFetcherImpl(userAgent.uaString(),
|
fetcher = new HttpFetcherImpl(userAgent,
|
||||||
new Dispatcher(),
|
new Dispatcher(),
|
||||||
new ConnectionPool(5, 10, TimeUnit.SECONDS)
|
new ConnectionPool(5, 10, TimeUnit.SECONDS)
|
||||||
);
|
);
|
||||||
|
@ -13,12 +13,12 @@ import java.util.Objects;
|
|||||||
public class ContentTypeProber {
|
public class ContentTypeProber {
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(ContentTypeProber.class);
|
private static final Logger logger = LoggerFactory.getLogger(ContentTypeProber.class);
|
||||||
private final String userAgent;
|
private final String userAgentString;
|
||||||
private final OkHttpClient client;
|
private final OkHttpClient client;
|
||||||
private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
||||||
|
|
||||||
public ContentTypeProber(String userAgent, OkHttpClient httpClient) {
|
public ContentTypeProber(String userAgentString, OkHttpClient httpClient) {
|
||||||
this.userAgent = userAgent;
|
this.userAgentString = userAgentString;
|
||||||
this.client = httpClient;
|
this.client = httpClient;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -35,7 +35,7 @@ public class ContentTypeProber {
|
|||||||
logger.debug("Probing suspected binary {}", url);
|
logger.debug("Probing suspected binary {}", url);
|
||||||
|
|
||||||
var headBuilder = new Request.Builder().head()
|
var headBuilder = new Request.Builder().head()
|
||||||
.addHeader("User-agent", userAgent)
|
.addHeader("User-agent", userAgentString)
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
.addHeader("Accept-Encoding", "gzip")
|
||||||
.url(url.toString());
|
.url(url.toString());
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@ import com.google.inject.name.Named;
|
|||||||
import crawlercommons.robots.SimpleRobotRules;
|
import crawlercommons.robots.SimpleRobotRules;
|
||||||
import crawlercommons.robots.SimpleRobotRulesParser;
|
import crawlercommons.robots.SimpleRobotRulesParser;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.UserAgent;
|
||||||
import nu.marginalia.crawl.retreival.Cookies;
|
import nu.marginalia.crawl.retreival.Cookies;
|
||||||
import nu.marginalia.crawl.retreival.RateLimitException;
|
import nu.marginalia.crawl.retreival.RateLimitException;
|
||||||
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult;
|
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult;
|
||||||
@ -35,7 +36,8 @@ import java.util.concurrent.TimeUnit;
|
|||||||
public class HttpFetcherImpl implements HttpFetcher {
|
public class HttpFetcherImpl implements HttpFetcher {
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final String userAgent;
|
private final String userAgentString;
|
||||||
|
private final String userAgentIdentifier;
|
||||||
private final Cookies cookies = new Cookies();
|
private final Cookies cookies = new Cookies();
|
||||||
|
|
||||||
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
|
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
|
||||||
@ -85,18 +87,20 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public HttpFetcherImpl(@Named("user-agent") String userAgent,
|
public HttpFetcherImpl(UserAgent userAgent,
|
||||||
Dispatcher dispatcher,
|
Dispatcher dispatcher,
|
||||||
ConnectionPool connectionPool)
|
ConnectionPool connectionPool)
|
||||||
{
|
{
|
||||||
this.client = createClient(dispatcher, connectionPool);
|
this.client = createClient(dispatcher, connectionPool);
|
||||||
this.userAgent = userAgent;
|
this.userAgentString = userAgent.uaString();
|
||||||
this.contentTypeProber = new ContentTypeProber(userAgent, client);
|
this.userAgentIdentifier = userAgent.uaIdentifier();
|
||||||
|
this.contentTypeProber = new ContentTypeProber(userAgentString, client);
|
||||||
}
|
}
|
||||||
|
|
||||||
public HttpFetcherImpl(@Named("user-agent") String userAgent) {
|
public HttpFetcherImpl(String userAgent) {
|
||||||
this.client = createClient(null, new ConnectionPool());
|
this.client = createClient(null, new ConnectionPool());
|
||||||
this.userAgent = userAgent;
|
this.userAgentString = userAgent;
|
||||||
|
this.userAgentIdentifier = userAgent;
|
||||||
this.contentTypeProber = new ContentTypeProber(userAgent, client);
|
this.contentTypeProber = new ContentTypeProber(userAgent, client);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -110,7 +114,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
@Override
|
@Override
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public FetchResult probeDomain(EdgeUrl url) {
|
public FetchResult probeDomain(EdgeUrl url) {
|
||||||
var head = new Request.Builder().head().addHeader("User-agent", userAgent)
|
var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
|
||||||
.url(url.toString())
|
.url(url.toString())
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
@ -170,7 +174,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
|
|
||||||
getBuilder.url(url.toString())
|
getBuilder.url(url.toString())
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
.addHeader("Accept-Encoding", "gzip")
|
||||||
.addHeader("User-agent", userAgent);
|
.addHeader("User-agent", userAgentString);
|
||||||
|
|
||||||
contentTags.paint(getBuilder);
|
contentTags.paint(getBuilder);
|
||||||
|
|
||||||
@ -212,7 +216,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
|
|
||||||
getBuilder.url(url.toString())
|
getBuilder.url(url.toString())
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
.addHeader("Accept-Encoding", "gzip")
|
||||||
.addHeader("User-agent", userAgent);
|
.addHeader("User-agent", userAgentString);
|
||||||
|
|
||||||
HttpFetchResult result = recorder.fetch(client, getBuilder.build());
|
HttpFetchResult result = recorder.fetch(client, getBuilder.build());
|
||||||
|
|
||||||
@ -220,7 +224,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
robotsParser.parseContent(url.toString(),
|
robotsParser.parseContent(url.toString(),
|
||||||
body,
|
body,
|
||||||
contentType.toString(),
|
contentType.toString(),
|
||||||
userAgent)
|
userAgentIdentifier)
|
||||||
);
|
);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -6,6 +6,7 @@ import com.google.inject.Inject;
|
|||||||
import nu.marginalia.IndexLocations;
|
import nu.marginalia.IndexLocations;
|
||||||
import nu.marginalia.ProcessConfiguration;
|
import nu.marginalia.ProcessConfiguration;
|
||||||
import nu.marginalia.ProcessConfigurationModule;
|
import nu.marginalia.ProcessConfigurationModule;
|
||||||
|
import nu.marginalia.service.ProcessMainClass;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.index.construction.ReverseIndexConstructor;
|
import nu.marginalia.index.construction.ReverseIndexConstructor;
|
||||||
import nu.marginalia.index.forward.ForwardIndexConverter;
|
import nu.marginalia.index.forward.ForwardIndexConverter;
|
||||||
@ -38,7 +39,7 @@ import java.util.function.LongPredicate;
|
|||||||
|
|
||||||
import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX;
|
import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX;
|
||||||
|
|
||||||
public class IndexConstructorMain {
|
public class IndexConstructorMain extends ProcessMainClass {
|
||||||
private final FileStorageService fileStorageService;
|
private final FileStorageService fileStorageService;
|
||||||
private final ProcessHeartbeatImpl heartbeat;
|
private final ProcessHeartbeatImpl heartbeat;
|
||||||
private final MessageQueueFactory messageQueueFactory;
|
private final MessageQueueFactory messageQueueFactory;
|
||||||
|
@ -8,6 +8,7 @@ import lombok.Getter;
|
|||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.ProcessConfiguration;
|
import nu.marginalia.ProcessConfiguration;
|
||||||
import nu.marginalia.ProcessConfigurationModule;
|
import nu.marginalia.ProcessConfigurationModule;
|
||||||
|
import nu.marginalia.service.ProcessMainClass;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.linkdb.docs.DocumentDbWriter;
|
import nu.marginalia.linkdb.docs.DocumentDbWriter;
|
||||||
import nu.marginalia.loading.documents.DocumentLoaderService;
|
import nu.marginalia.loading.documents.DocumentLoaderService;
|
||||||
@ -37,7 +38,7 @@ import java.util.concurrent.TimeUnit;
|
|||||||
|
|
||||||
import static nu.marginalia.mqapi.ProcessInboxNames.LOADER_INBOX;
|
import static nu.marginalia.mqapi.ProcessInboxNames.LOADER_INBOX;
|
||||||
|
|
||||||
public class LoaderMain {
|
public class LoaderMain extends ProcessMainClass {
|
||||||
private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);
|
private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);
|
||||||
|
|
||||||
private final ProcessHeartbeatImpl heartbeat;
|
private final ProcessHeartbeatImpl heartbeat;
|
||||||
|
@ -8,6 +8,7 @@ import nu.marginalia.model.EdgeDomain;
|
|||||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||||
import nu.marginalia.process.control.ProcessHeartbeatImpl;
|
import nu.marginalia.process.control.ProcessHeartbeatImpl;
|
||||||
import nu.marginalia.query.client.QueryClient;
|
import nu.marginalia.query.client.QueryClient;
|
||||||
|
import nu.marginalia.service.MainClass;
|
||||||
import nu.marginalia.service.module.DatabaseModule;
|
import nu.marginalia.service.module.DatabaseModule;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@ -22,7 +23,7 @@ import java.util.stream.IntStream;
|
|||||||
|
|
||||||
import static nu.marginalia.adjacencies.SparseBitVector.*;
|
import static nu.marginalia.adjacencies.SparseBitVector.*;
|
||||||
|
|
||||||
public class WebsiteAdjacenciesCalculator {
|
public class WebsiteAdjacenciesCalculator extends MainClass {
|
||||||
private final HikariDataSource dataSource;
|
private final HikariDataSource dataSource;
|
||||||
public AdjacenciesData adjacenciesData;
|
public AdjacenciesData adjacenciesData;
|
||||||
public DomainAliases domainAliases;
|
public DomainAliases domainAliases;
|
||||||
|
@ -19,7 +19,7 @@ public class SearchModule extends AbstractModule {
|
|||||||
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||||
|
|
||||||
bind(WebsiteUrl.class).toInstance(new WebsiteUrl(
|
bind(WebsiteUrl.class).toInstance(new WebsiteUrl(
|
||||||
System.getProperty("website-url", "https://search.marginalia.nu/")));
|
System.getProperty("search.websiteUrl", "https://search.marginalia.nu/")));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Provides
|
@Provides
|
||||||
|
@ -25,7 +25,8 @@ public class ControlRendererFactory {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public Renderer renderer(String template) {
|
public Renderer renderer(String template) {
|
||||||
Map<String, Object> globalContext = Map.of(
|
Map<String, Object> globalContext = Map.of(
|
||||||
"nodes", nodeConfigurationService.getAll()
|
"nodes", nodeConfigurationService.getAll(),
|
||||||
|
"hideMarginaliaApp", Boolean.getBoolean("control.hideMarginaliaApp")
|
||||||
);
|
);
|
||||||
var baseRenderer = rendererFactory.renderer(template);
|
var baseRenderer = rendererFactory.renderer(template);
|
||||||
|
|
||||||
|
@ -8,6 +8,7 @@
|
|||||||
<div class="collapse navbar-collapse" id="navbarSupportedContent">
|
<div class="collapse navbar-collapse" id="navbarSupportedContent">
|
||||||
<ul class="navbar-nav me-auto mb-2 mb-lg-0">
|
<ul class="navbar-nav me-auto mb-2 mb-lg-0">
|
||||||
<li class="nav-item"><a class="nav-link" href="/">Overview</a></li>
|
<li class="nav-item"><a class="nav-link" href="/">Overview</a></li>
|
||||||
|
{{#unless hideMarginaliaApp}}
|
||||||
<li class="nav-item dropdown">
|
<li class="nav-item dropdown">
|
||||||
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Application</a>
|
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Application</a>
|
||||||
<ul class="dropdown-menu">
|
<ul class="dropdown-menu">
|
||||||
@ -18,6 +19,7 @@
|
|||||||
<li><a class="dropdown-item" href="/review-random-domains" title="Review random domains list">Random Exploration</a></li>
|
<li><a class="dropdown-item" href="/review-random-domains" title="Review random domains list">Random Exploration</a></li>
|
||||||
</ul>
|
</ul>
|
||||||
</li>
|
</li>
|
||||||
|
{{/unless}}
|
||||||
<li class="nav-item dropdown">
|
<li class="nav-item dropdown">
|
||||||
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Index Nodes</a>
|
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Index Nodes</a>
|
||||||
<ul class="dropdown-menu">
|
<ul class="dropdown-menu">
|
||||||
|
@ -1,24 +1,37 @@
|
|||||||
# System Properties
|
# System Properties
|
||||||
|
|
||||||
These are JVM system properties used by each service
|
These are JVM system properties used by each service. These properties can either
|
||||||
|
be loaded from a file or passed in as command line arguments, using `$JAVA_OPTS`.
|
||||||
|
|
||||||
## Search Service
|
The system will look for a properties file in `conf/properties/system.properties`,
|
||||||
| flag | values | description |
|
within the install dir, as specified by `$WMSA_HOME`.
|
||||||
|-------------|------------|-------------------------------------------------------|
|
|
||||||
| website-url |https://search.marginalia.nu/|Overrides the website URL used in rendering|
|
|
||||||
|
|
||||||
## Crawler Process
|
A template is available in [../run/template/conf/properties/system.properties](../run/template/conf/properties/system.properties).
|
||||||
|flag| values | description |
|
## Global
|
||||||
|---|------------|-------------------------------------------------------|
|
|
||||||
|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |
|
|
||||||
|
|
||||||
## Converter Process
|
| flag | values | description |
|
||||||
|flag| values | description |
|
|-------------|------------|--------------------------------------|
|
||||||
|---|------------|-------------------------------------------------------|
|
| blacklist.disable | boolean | Disables the domain blacklist |
|
||||||
|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |
|
| flyway.disable | boolean | Disables automatic Flyway migrations |
|
||||||
|
|
||||||
## Loader Process
|
## Crawler Properties
|
||||||
|flag| values | description |
|
|
||||||
|---|------------|-------------------------------------------------------|
|
| flag | values | description |
|
||||||
|local-index-path| /some/path | Selects the location the loader will write index data |
|
|-----------------------------|------------|------------------------------------------------------------------------------------------|
|
||||||
|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |
|
| crawler.userAgentString | string | Sets the user agent string used by the crawler |
|
||||||
|
| crawler.userAgentIdentifier | string | Sets the user agent identifier used by the crawler, e.g. what it looks for in robots.txt |
|
||||||
|
| crawler.poolSize | integer | Sets the number of threads used by the crawler, more is faster, but uses more RAM |
|
||||||
|
| ip-blocklist.disabled | boolean | Disables the IP blocklist |
|
||||||
|
|
||||||
|
## Converter Properties
|
||||||
|
|
||||||
|
| flag | values | description |
|
||||||
|
|-----------------------------|------------|----------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| converter.sideloadThreshold | integer | Threshold value, in number of documents per domain, where a simpler processing method is used which uses less RAM. 10,000 is a good value for ~32GB RAM |
|
||||||
|
|
||||||
|
# Marginalia Application Specific
|
||||||
|
|
||||||
|
| flag | values | description |
|
||||||
|
|---------------------------|------------|---------------------------------------------------------------|
|
||||||
|
| search.websiteUrl | string | Overrides the website URL used in rendering |
|
||||||
|
| control.hideMarginaliaApp | boolean | Hides the Marginalia application from the control GUI results |
|
||||||
|
1
run/env/service.env
vendored
1
run/env/service.env
vendored
@ -6,4 +6,3 @@ CONVERTER_PROCESS_OPTS="-Dservice-name=converter -Dservice-host=0.0.0.0"
|
|||||||
CRAWLER_PROCESS_OPTS="-Dservice-name=crawler -Dservice-host=0.0.0.0"
|
CRAWLER_PROCESS_OPTS="-Dservice-name=crawler -Dservice-host=0.0.0.0"
|
||||||
LOADER_PROCESS_OPTS="-Dservice-name=loader -Dservice-host=0.0.0.0"
|
LOADER_PROCESS_OPTS="-Dservice-name=loader -Dservice-host=0.0.0.0"
|
||||||
INDEX_CONSTRUCTION_PROCESS_OPTS="-Dservice-name=index-constructor -Djava.util.concurrent.ForkJoinPool.common.parallelism=4"
|
INDEX_CONSTRUCTION_PROCESS_OPTS="-Dservice-name=index-constructor -Djava.util.concurrent.ForkJoinPool.common.parallelism=4"
|
||||||
SEARCH_SERVICE_OPTS="-Dwebsite-url=http://localhost:8080"
|
|
13
run/template/conf/properties/system.properties
Normal file
13
run/template/conf/properties/system.properties
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
crawler.userAgentString = Mozilla/5.0 (compatible)
|
||||||
|
crawler.userAgentIdentifier = GoogleBot
|
||||||
|
crawler.poolSize = 256
|
||||||
|
|
||||||
|
search.websiteUrl = https://localhost:8080/
|
||||||
|
|
||||||
|
executor.uploadDir = /uploads
|
||||||
|
converter.sideloadThreshold = 10000
|
||||||
|
|
||||||
|
ip-blocklist.disabled = false
|
||||||
|
blacklist.disable = false
|
||||||
|
flyway.disable = false
|
||||||
|
control.hideMarginaliaApp = false
|
@ -1 +0,0 @@
|
|||||||
PoorlyConfiguredWebCrawler
|
|
Loading…
Reference in New Issue
Block a user