mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Make User-agent configurable.
This commit is contained in:
parent
80dad31753
commit
d8d0c0e5b2
@ -3,6 +3,8 @@
|
||||
mkdir -p /var/lib/wmsa/conf/
|
||||
mkdir -p /var/lib/wmsa/data/
|
||||
|
||||
echo "search.marginalia.nu" > /var/lib/wmsa/conf/user-agent
|
||||
|
||||
cat > /var/lib/wmsa/conf/db.properties <<EOF
|
||||
db.user=wmsa
|
||||
db.pass=wmsa
|
||||
|
@ -0,0 +1,5 @@
|
||||
package nu.marginalia.wmsa.configuration;
|
||||
|
||||
/**
 * Immutable wrapper around a raw User-Agent string.
 *
 * @param uaString the user-agent value; must not be {@code null}
 */
public record UserAgent(String uaString) {

    /**
     * Rejects a {@code null} value up front so a misconfiguration fails at
     * construction time rather than wherever the string is eventually used.
     *
     * @throws NullPointerException if {@code uaString} is {@code null}
     */
    public UserAgent {
        // Fully qualified: this file has no import section to extend.
        java.util.Objects.requireNonNull(uaString, "uaString");
    }
}
|
@ -1,5 +1,6 @@
|
||||
package nu.marginalia.wmsa.configuration;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
@ -8,6 +9,16 @@ import java.util.Properties;
|
||||
public class WmsaHome {
|
||||
private static final String DEFAULT = "/var/lib/wmsa";
|
||||
|
||||
public static UserAgent getUserAgent() throws IOException {
|
||||
var uaPath = getHomePath().resolve("conf/user-agent");
|
||||
|
||||
if (!Files.exists(uaPath)) {
|
||||
throw new FileNotFoundException("Could not find " + uaPath);
|
||||
}
|
||||
|
||||
return new UserAgent(Files.readString(uaPath).trim());
|
||||
}
|
||||
|
||||
public static Path getHomePath() {
|
||||
var ret = Path.of(System.getProperty("WMSA_HOME", DEFAULT));
|
||||
if (!Files.isDirectory(ret)) {
|
||||
|
@ -2,44 +2,26 @@ package nu.marginalia.wmsa.configuration.module;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
import com.google.inject.Provides;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.inject.name.Named;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.Objects;
|
||||
|
||||
import static com.google.inject.name.Names.named;
|
||||
|
||||
public class ConfigurationModule extends AbstractModule {
|
||||
private static final String SERVICE_NAME = System.getProperty("service-name");
|
||||
public static final int MONITOR_PORT = Integer.getInteger("monitor.port", 5000);
|
||||
public static final String MONITOR_HOST = System.getProperty("monitor.host", "127.0.0.1");
|
||||
|
||||
public void configure() {
|
||||
bind(Integer.class).annotatedWith(named("monitor-port")).toInstance(MONITOR_PORT);
|
||||
bind(String.class).annotatedWith(named("monitor-host")).toInstance(MONITOR_HOST);
|
||||
bind(Integer.class).annotatedWith(named("monitor-boot-timeout")).toInstance(10);
|
||||
|
||||
bind(String.class).annotatedWith(named("service-name")).toInstance(Objects.requireNonNull(SERVICE_NAME));
|
||||
bind(String.class).annotatedWith(named("service-host")).toProvider(HostnameProvider.class).in(Singleton.class);
|
||||
bind(Integer.class).annotatedWith(named("service-port")).toProvider(PortProvider.class).in(Singleton.class);
|
||||
bind(Integer.class).annotatedWith(named("metrics-server-port")).toProvider(MetricsPortProvider.class).in(Singleton.class);
|
||||
|
||||
bind(String.class).annotatedWith(named("service-host")).toInstance(System.getProperty("service-host", "127.0.0.1"));
|
||||
bind(Integer.class).annotatedWith(named("service-port")).toInstance(ServiceDescriptor.byName(System.getProperty("service-name")).port);
|
||||
}
|
||||
|
||||
@Provides
|
||||
@Named("build-version")
|
||||
@SneakyThrows
|
||||
public String buildVersion() {
|
||||
try (var str = ClassLoader.getSystemResourceAsStream("_version.txt")) {
|
||||
if (null == str) {
|
||||
System.err.println("Missing _version.txt from classpath");
|
||||
return LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME);
|
||||
}
|
||||
return new String(str.readAllBytes());
|
||||
}
|
||||
@Named("metrics-server-port")
|
||||
public Integer provideMetricsServerPort(@Named("service-port") Integer servicePort) {
|
||||
return servicePort + 1000;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,36 +0,0 @@
|
||||
package nu.marginalia.wmsa.configuration.module;
|
||||
|
||||
import com.google.inject.name.Named;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.inject.Inject;
|
||||
import javax.inject.Provider;
|
||||
|
||||
public class HostnameProvider implements Provider<String> {
|
||||
private static final String DEFAULT_HOSTNAME = "127.0.0.1";
|
||||
private final int monitorPort;
|
||||
private final String monitorHost;
|
||||
private final int timeout;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public HostnameProvider(@Named("monitor-port") Integer monitorPort,
|
||||
@Named("monitor-host") String monitorHost,
|
||||
@Named("monitor-boot-timeout") Integer timeout
|
||||
) {
|
||||
this.monitorHost = monitorHost;
|
||||
this.monitorPort = monitorPort;
|
||||
this.timeout = timeout;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String get() {
|
||||
var override = System.getProperty("service-host");
|
||||
if (null != override) {
|
||||
return override;
|
||||
}
|
||||
return DEFAULT_HOSTNAME;
|
||||
}
|
||||
|
||||
}
|
@ -1,46 +0,0 @@
|
||||
package nu.marginalia.wmsa.configuration.module;
|
||||
|
||||
import com.google.inject.name.Named;
|
||||
import io.reactivex.rxjava3.core.Flowable;
|
||||
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
|
||||
import org.apache.http.HttpResponse;
|
||||
import org.reactivestreams.Publisher;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.inject.Inject;
|
||||
import javax.inject.Provider;
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class PortProvider implements Provider<Integer> {
|
||||
private static final Integer DEFAULT_PORT = 5000;
|
||||
private final int monitorPort;
|
||||
private final String monitorHost;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final int timeout = 10;
|
||||
@Inject
|
||||
public PortProvider(@Named("monitor-port") Integer monitorPort,
|
||||
@Named("monitor-host") String monitorHost,
|
||||
@Named("monitor-boot-timeout") Integer timeout) {
|
||||
this.monitorHost = monitorHost;
|
||||
this.monitorPort = monitorPort;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer get() {
|
||||
return ServiceDescriptor.byName(System.getProperty("service-name")).port;
|
||||
}
|
||||
|
||||
private Publisher<?> repeatDelay(Flowable<Throwable> error) {
|
||||
return error.delay(1, TimeUnit.SECONDS);
|
||||
}
|
||||
|
||||
private String accept200(HttpResponse rsp) throws IOException {
|
||||
if (rsp.getStatusLine().getStatusCode() != 200) {
|
||||
throw new RuntimeException("Monitor responded unexpected status "
|
||||
+ rsp.getStatusLine().getStatusCode());
|
||||
}
|
||||
return new String(rsp.getEntity().getContent().readAllBytes());
|
||||
}
|
||||
}
|
@ -2,6 +2,8 @@ package nu.marginalia.wmsa.edge.crawling;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import nu.marginalia.wmsa.configuration.UserAgent;
|
||||
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
||||
import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver;
|
||||
@ -34,10 +36,12 @@ public class CrawlerMain implements AutoCloseable {
|
||||
private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
|
||||
new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
|
||||
|
||||
private final UserAgent userAgent;
|
||||
|
||||
public CrawlerMain(EdgeCrawlPlan plan) throws Exception {
|
||||
this.inputSpec = plan.getJobSpec();
|
||||
this.numberOfThreads = 512;
|
||||
this.userAgent = WmsaHome.getUserAgent();
|
||||
|
||||
workLog = new WorkLog(plan.crawl.getLogFile());
|
||||
domainWriter = new CrawledDomainWriter(plan.crawl.getDir());
|
||||
@ -88,7 +92,7 @@ public class CrawlerMain implements AutoCloseable {
|
||||
if (workLog.isJobFinished(specification.id))
|
||||
return null;
|
||||
|
||||
var fetcher = new HttpFetcher("search.marginalia.nu", dispatcher);
|
||||
var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher);
|
||||
|
||||
try {
|
||||
var retreiver = new CrawlerRetreiver(fetcher, specification);
|
||||
|
Loading…
Reference in New Issue
Block a user