mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Make User-agent configurable.
This commit is contained in:
parent
80dad31753
commit
d8d0c0e5b2
@ -3,6 +3,8 @@
|
|||||||
mkdir -p /var/lib/wmsa/conf/
|
mkdir -p /var/lib/wmsa/conf/
|
||||||
mkdir -p /var/lib/wmsa/data/
|
mkdir -p /var/lib/wmsa/data/
|
||||||
|
|
||||||
|
echo "search.marginalia.nu" > /var/lib/wmsa/conf/user-agent
|
||||||
|
|
||||||
cat > /var/lib/wmsa/conf/db.properties <<EOF
|
cat > /var/lib/wmsa/conf/db.properties <<EOF
|
||||||
db.user=wmsa
|
db.user=wmsa
|
||||||
db.pass=wmsa
|
db.pass=wmsa
|
||||||
|
@ -0,0 +1,5 @@
|
|||||||
|
package nu.marginalia.wmsa.configuration;
|
||||||
|
|
||||||
|
/**
 * Immutable value type wrapping a User-Agent string.
 *
 * @param uaString the raw User-Agent value; stored as given, without validation
 */
public record UserAgent(String uaString) { }
|
@ -1,5 +1,6 @@
|
|||||||
package nu.marginalia.wmsa.configuration;
|
package nu.marginalia.wmsa.configuration;
|
||||||
|
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
@ -8,6 +9,16 @@ import java.util.Properties;
|
|||||||
public class WmsaHome {
|
public class WmsaHome {
|
||||||
private static final String DEFAULT = "/var/lib/wmsa";
|
private static final String DEFAULT = "/var/lib/wmsa";
|
||||||
|
|
||||||
|
public static UserAgent getUserAgent() throws IOException {
|
||||||
|
var uaPath = getHomePath().resolve("conf/user-agent");
|
||||||
|
|
||||||
|
if (!Files.exists(uaPath)) {
|
||||||
|
throw new FileNotFoundException("Could not find " + uaPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new UserAgent(Files.readString(uaPath).trim());
|
||||||
|
}
|
||||||
|
|
||||||
public static Path getHomePath() {
|
public static Path getHomePath() {
|
||||||
var ret = Path.of(System.getProperty("WMSA_HOME", DEFAULT));
|
var ret = Path.of(System.getProperty("WMSA_HOME", DEFAULT));
|
||||||
if (!Files.isDirectory(ret)) {
|
if (!Files.isDirectory(ret)) {
|
||||||
|
@ -2,44 +2,26 @@ package nu.marginalia.wmsa.configuration.module;
|
|||||||
|
|
||||||
import com.google.inject.AbstractModule;
|
import com.google.inject.AbstractModule;
|
||||||
import com.google.inject.Provides;
|
import com.google.inject.Provides;
|
||||||
import com.google.inject.Singleton;
|
|
||||||
import com.google.inject.name.Named;
|
import com.google.inject.name.Named;
|
||||||
import lombok.SneakyThrows;
|
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
|
||||||
|
|
||||||
import java.time.LocalDateTime;
|
|
||||||
import java.time.format.DateTimeFormatter;
|
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
import static com.google.inject.name.Names.named;
|
import static com.google.inject.name.Names.named;
|
||||||
|
|
||||||
public class ConfigurationModule extends AbstractModule {
|
public class ConfigurationModule extends AbstractModule {
|
||||||
private static final String SERVICE_NAME = System.getProperty("service-name");
|
private static final String SERVICE_NAME = System.getProperty("service-name");
|
||||||
public static final int MONITOR_PORT = Integer.getInteger("monitor.port", 5000);
|
|
||||||
public static final String MONITOR_HOST = System.getProperty("monitor.host", "127.0.0.1");
|
|
||||||
|
|
||||||
public void configure() {
|
public void configure() {
|
||||||
bind(Integer.class).annotatedWith(named("monitor-port")).toInstance(MONITOR_PORT);
|
|
||||||
bind(String.class).annotatedWith(named("monitor-host")).toInstance(MONITOR_HOST);
|
|
||||||
bind(Integer.class).annotatedWith(named("monitor-boot-timeout")).toInstance(10);
|
|
||||||
|
|
||||||
bind(String.class).annotatedWith(named("service-name")).toInstance(Objects.requireNonNull(SERVICE_NAME));
|
bind(String.class).annotatedWith(named("service-name")).toInstance(Objects.requireNonNull(SERVICE_NAME));
|
||||||
bind(String.class).annotatedWith(named("service-host")).toProvider(HostnameProvider.class).in(Singleton.class);
|
bind(String.class).annotatedWith(named("service-host")).toInstance(System.getProperty("service-host", "127.0.0.1"));
|
||||||
bind(Integer.class).annotatedWith(named("service-port")).toProvider(PortProvider.class).in(Singleton.class);
|
bind(Integer.class).annotatedWith(named("service-port")).toInstance(ServiceDescriptor.byName(System.getProperty("service-name")).port);
|
||||||
bind(Integer.class).annotatedWith(named("metrics-server-port")).toProvider(MetricsPortProvider.class).in(Singleton.class);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Provides
|
@Provides
|
||||||
@Named("build-version")
|
@Named("metrics-server-port")
|
||||||
@SneakyThrows
|
public Integer provideMetricsServerPort(@Named("service-port") Integer servicePort) {
|
||||||
public String buildVersion() {
|
return servicePort + 1000;
|
||||||
try (var str = ClassLoader.getSystemResourceAsStream("_version.txt")) {
|
|
||||||
if (null == str) {
|
|
||||||
System.err.println("Missing _version.txt from classpath");
|
|
||||||
return LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME);
|
|
||||||
}
|
|
||||||
return new String(str.readAllBytes());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,36 +0,0 @@
|
|||||||
package nu.marginalia.wmsa.configuration.module;
|
|
||||||
|
|
||||||
import com.google.inject.name.Named;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import javax.inject.Inject;
|
|
||||||
import javax.inject.Provider;
|
|
||||||
|
|
||||||
public class HostnameProvider implements Provider<String> {
|
|
||||||
private static final String DEFAULT_HOSTNAME = "127.0.0.1";
|
|
||||||
private final int monitorPort;
|
|
||||||
private final String monitorHost;
|
|
||||||
private final int timeout;
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
@Inject
|
|
||||||
public HostnameProvider(@Named("monitor-port") Integer monitorPort,
|
|
||||||
@Named("monitor-host") String monitorHost,
|
|
||||||
@Named("monitor-boot-timeout") Integer timeout
|
|
||||||
) {
|
|
||||||
this.monitorHost = monitorHost;
|
|
||||||
this.monitorPort = monitorPort;
|
|
||||||
this.timeout = timeout;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String get() {
|
|
||||||
var override = System.getProperty("service-host");
|
|
||||||
if (null != override) {
|
|
||||||
return override;
|
|
||||||
}
|
|
||||||
return DEFAULT_HOSTNAME;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,46 +0,0 @@
|
|||||||
package nu.marginalia.wmsa.configuration.module;
|
|
||||||
|
|
||||||
import com.google.inject.name.Named;
|
|
||||||
import io.reactivex.rxjava3.core.Flowable;
|
|
||||||
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
|
|
||||||
import org.apache.http.HttpResponse;
|
|
||||||
import org.reactivestreams.Publisher;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import javax.inject.Inject;
|
|
||||||
import javax.inject.Provider;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.concurrent.TimeUnit;
|
|
||||||
|
|
||||||
public class PortProvider implements Provider<Integer> {
|
|
||||||
private static final Integer DEFAULT_PORT = 5000;
|
|
||||||
private final int monitorPort;
|
|
||||||
private final String monitorHost;
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
private final int timeout = 10;
|
|
||||||
@Inject
|
|
||||||
public PortProvider(@Named("monitor-port") Integer monitorPort,
|
|
||||||
@Named("monitor-host") String monitorHost,
|
|
||||||
@Named("monitor-boot-timeout") Integer timeout) {
|
|
||||||
this.monitorHost = monitorHost;
|
|
||||||
this.monitorPort = monitorPort;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Integer get() {
|
|
||||||
return ServiceDescriptor.byName(System.getProperty("service-name")).port;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Publisher<?> repeatDelay(Flowable<Throwable> error) {
|
|
||||||
return error.delay(1, TimeUnit.SECONDS);
|
|
||||||
}
|
|
||||||
|
|
||||||
private String accept200(HttpResponse rsp) throws IOException {
|
|
||||||
if (rsp.getStatusLine().getStatusCode() != 200) {
|
|
||||||
throw new RuntimeException("Monitor responded unexpected status "
|
|
||||||
+ rsp.getStatusLine().getStatusCode());
|
|
||||||
}
|
|
||||||
return new String(rsp.getEntity().getContent().readAllBytes());
|
|
||||||
}
|
|
||||||
}
|
|
@ -2,6 +2,8 @@ package nu.marginalia.wmsa.edge.crawling;
|
|||||||
|
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import com.google.gson.GsonBuilder;
|
import com.google.gson.GsonBuilder;
|
||||||
|
import nu.marginalia.wmsa.configuration.UserAgent;
|
||||||
|
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
||||||
import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver;
|
import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver;
|
||||||
@ -34,10 +36,12 @@ public class CrawlerMain implements AutoCloseable {
|
|||||||
private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
|
private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
|
||||||
new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
|
new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
|
||||||
|
|
||||||
|
private final UserAgent userAgent;
|
||||||
|
|
||||||
public CrawlerMain(EdgeCrawlPlan plan) throws Exception {
|
public CrawlerMain(EdgeCrawlPlan plan) throws Exception {
|
||||||
this.inputSpec = plan.getJobSpec();
|
this.inputSpec = plan.getJobSpec();
|
||||||
this.numberOfThreads = 512;
|
this.numberOfThreads = 512;
|
||||||
|
this.userAgent = WmsaHome.getUserAgent();
|
||||||
|
|
||||||
workLog = new WorkLog(plan.crawl.getLogFile());
|
workLog = new WorkLog(plan.crawl.getLogFile());
|
||||||
domainWriter = new CrawledDomainWriter(plan.crawl.getDir());
|
domainWriter = new CrawledDomainWriter(plan.crawl.getDir());
|
||||||
@ -88,7 +92,7 @@ public class CrawlerMain implements AutoCloseable {
|
|||||||
if (workLog.isJobFinished(specification.id))
|
if (workLog.isJobFinished(specification.id))
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
var fetcher = new HttpFetcher("search.marginalia.nu", dispatcher);
|
var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
var retreiver = new CrawlerRetreiver(fetcher, specification);
|
var retreiver = new CrawlerRetreiver(fetcher, specification);
|
||||||
|
Loading…
Reference in New Issue
Block a user