Make User-agent configurable.

This commit is contained in:
vlofgren 2022-06-01 14:46:51 +02:00
parent 80dad31753
commit d8d0c0e5b2
7 changed files with 29 additions and 107 deletions

View File

@ -3,6 +3,8 @@
mkdir -p /var/lib/wmsa/conf/
mkdir -p /var/lib/wmsa/data/
echo "search.marginalia.nu" > /var/lib/wmsa/conf/user-agent
cat > /var/lib/wmsa/conf/db.properties <<EOF
db.user=wmsa
db.pass=wmsa

View File

@ -0,0 +1,5 @@
package nu.marginalia.wmsa.configuration;
/**
 * Immutable carrier for a user-agent string.
 *
 * @param uaString the user-agent value; must not be {@code null}
 */
public record UserAgent(String uaString) {
    /**
     * Validates the component so that a missing value fails here, at construction,
     * instead of wherever the string is eventually consumed.
     *
     * @throws NullPointerException if {@code uaString} is {@code null}
     */
    public UserAgent {
        // Fully qualified to avoid requiring a new import in this file.
        java.util.Objects.requireNonNull(uaString, "uaString");
    }
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.wmsa.configuration;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
@ -8,6 +9,16 @@ import java.util.Properties;
public class WmsaHome {
private static final String DEFAULT = "/var/lib/wmsa";
/**
 * Reads the user-agent string from {@code conf/user-agent} below the WMSA home directory.
 *
 * <p>Leading and trailing whitespace (such as the newline left by writing the file
 * with {@code echo}) is trimmed from the file contents.
 *
 * @return the configured user agent; its string is never empty
 * @throws FileNotFoundException if the user-agent file does not exist
 * @throws IOException if the file cannot be read, or contains nothing but whitespace
 */
public static UserAgent getUserAgent() throws IOException {
    var uaPath = getHomePath().resolve("conf/user-agent");

    if (!Files.exists(uaPath)) {
        throw new FileNotFoundException("Could not find " + uaPath);
    }

    var uaString = Files.readString(uaPath).trim();

    // An empty file would otherwise silently produce a blank user agent.
    if (uaString.isEmpty()) {
        throw new IOException("User-agent file is empty: " + uaPath);
    }

    return new UserAgent(uaString);
}
public static Path getHomePath() {
var ret = Path.of(System.getProperty("WMSA_HOME", DEFAULT));
if (!Files.isDirectory(ret)) {

View File

@ -2,44 +2,26 @@ package nu.marginalia.wmsa.configuration.module;
import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Objects;
import static com.google.inject.name.Names.named;
/**
 * Guice module that binds service configuration values (monitor host/port,
 * service name/host/port) under {@code @Named} keys and exposes provider methods
 * for derived values.
 */
public class ConfigurationModule extends AbstractModule {
// Value of the "service-name" system property; null when the property is unset.
private static final String SERVICE_NAME = System.getProperty("service-name");
// System property "monitor.port", defaulting to 5000.
public static final int MONITOR_PORT = Integer.getInteger("monitor.port", 5000);
// System property "monitor.host", defaulting to 127.0.0.1.
public static final String MONITOR_HOST = System.getProperty("monitor.host", "127.0.0.1");
/** Registers the named constant bindings. */
public void configure() {
bind(Integer.class).annotatedWith(named("monitor-port")).toInstance(MONITOR_PORT);
bind(String.class).annotatedWith(named("monitor-host")).toInstance(MONITOR_HOST);
// Constant 10; the unit is not visible here -- presumably seconds, TODO confirm.
bind(Integer.class).annotatedWith(named("monitor-boot-timeout")).toInstance(10);
// Throws NullPointerException when the "service-name" system property was not set.
bind(String.class).annotatedWith(named("service-name")).toInstance(Objects.requireNonNull(SERVICE_NAME));
// NOTE(review): "service-host" and "service-port" are bound twice in this listing,
// once via provider classes and once via toInstance below. This looks like old and
// new lines of the change shown together -- confirm which pair is meant to remain.
bind(String.class).annotatedWith(named("service-host")).toProvider(HostnameProvider.class).in(Singleton.class);
bind(Integer.class).annotatedWith(named("service-port")).toProvider(PortProvider.class).in(Singleton.class);
bind(Integer.class).annotatedWith(named("metrics-server-port")).toProvider(MetricsPortProvider.class).in(Singleton.class);
// Host: "service-host" system property, defaulting to 127.0.0.1.
bind(String.class).annotatedWith(named("service-host")).toInstance(System.getProperty("service-host", "127.0.0.1"));
// Port: looked up from the ServiceDescriptor matching the "service-name" system property.
bind(Integer.class).annotatedWith(named("service-port")).toInstance(ServiceDescriptor.byName(System.getProperty("service-name")).port);
}
/**
 * Provides the "build-version" string: the contents of {@code _version.txt} on the
 * classpath, or the current local date-time in ISO format when that resource is missing.
 */
@Provides
@Named("build-version")
@SneakyThrows
public String buildVersion() {
// try-with-resources closes the resource stream once it has been read.
try (var str = ClassLoader.getSystemResourceAsStream("_version.txt")) {
if (null == str) {
System.err.println("Missing _version.txt from classpath");
return LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME);
}
// NOTE(review): no charset is given, so decoding uses the platform default -- confirm intended.
return new String(str.readAllBytes());
}
// NOTE(review): no closing brace for buildVersion() appears before the next method in
// this listing; the lines below look like they belong to a separate provider method.
// Provides "metrics-server-port" as the "service-port" value plus 1000.
@Named("metrics-server-port")
public Integer provideMetricsServerPort(@Named("service-port") Integer servicePort) {
return servicePort + 1000;
}
}

View File

@ -1,36 +0,0 @@
package nu.marginalia.wmsa.configuration.module;
import com.google.inject.name.Named;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.inject.Provider;
/**
 * Provider of a host name string: the value of the {@code service-host} system
 * property when it is set, otherwise {@code 127.0.0.1}.
 *
 * <p>The monitor settings passed to the constructor are stored but are not read
 * by {@link #get()}.
 */
public class HostnameProvider implements Provider<String> {
    private static final String DEFAULT_HOSTNAME = "127.0.0.1";

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final String monitorHost;
    private final int monitorPort;
    private final int timeout;

    /**
     * @param monitorPort value bound as {@code monitor-port}
     * @param monitorHost value bound as {@code monitor-host}
     * @param timeout value bound as {@code monitor-boot-timeout}
     */
    @Inject
    public HostnameProvider(@Named("monitor-port") Integer monitorPort,
                            @Named("monitor-host") String monitorHost,
                            @Named("monitor-boot-timeout") Integer timeout) {
        this.timeout = timeout;
        this.monitorPort = monitorPort;
        this.monitorHost = monitorHost;
    }

    /**
     * @return the {@code service-host} system property, or the default host name
     *         when that property is not set
     */
    @Override
    public String get() {
        return System.getProperty("service-host", DEFAULT_HOSTNAME);
    }
}

View File

@ -1,46 +0,0 @@
package nu.marginalia.wmsa.configuration.module;
import com.google.inject.name.Named;
import io.reactivex.rxjava3.core.Flowable;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import org.apache.http.HttpResponse;
import org.reactivestreams.Publisher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.inject.Provider;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
/**
 * Provider of a port number: the {@code port} of the {@link ServiceDescriptor}
 * whose name matches the {@code service-name} system property.
 *
 * <p>The monitor settings passed to the constructor are stored but are not read
 * by {@link #get()}; the private helpers below are not called from within this class.
 */
public class PortProvider implements Provider<Integer> {
    private static final Integer DEFAULT_PORT = 5000;

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final String monitorHost;
    private final int monitorPort;
    // Fixed at 10; the constructor's timeout argument is not stored.
    private final int timeout = 10;

    /**
     * @param monitorPort value bound as {@code monitor-port}
     * @param monitorHost value bound as {@code monitor-host}
     * @param timeout value bound as {@code monitor-boot-timeout}; accepted but unused
     */
    @Inject
    public PortProvider(@Named("monitor-port") Integer monitorPort,
                        @Named("monitor-host") String monitorHost,
                        @Named("monitor-boot-timeout") Integer timeout) {
        this.monitorPort = monitorPort;
        this.monitorHost = monitorHost;
    }

    /** @return the port of the service descriptor named by the {@code service-name} system property */
    @Override
    public Integer get() {
        final String serviceName = System.getProperty("service-name");
        return ServiceDescriptor.byName(serviceName).port;
    }

    /** Delays each element of the given error stream by one second. */
    private Publisher<?> repeatDelay(Flowable<Throwable> error) {
        return error.delay(1, TimeUnit.SECONDS);
    }

    /**
     * Returns the response body as a string when the status is 200.
     *
     * @throws RuntimeException if the status code is anything other than 200
     * @throws IOException if reading the response body fails
     */
    private String accept200(HttpResponse rsp) throws IOException {
        final int status = rsp.getStatusLine().getStatusCode();
        if (status == 200) {
            final byte[] body = rsp.getEntity().getContent().readAllBytes();
            return new String(body);
        }
        throw new RuntimeException("Monitor responded unexpected status "
                + status);
    }
}

View File

@ -2,6 +2,8 @@ package nu.marginalia.wmsa.edge.crawling;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import nu.marginalia.wmsa.configuration.UserAgent;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver;
@ -34,10 +36,12 @@ public class CrawlerMain implements AutoCloseable {
private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
private final UserAgent userAgent;
public CrawlerMain(EdgeCrawlPlan plan) throws Exception {
this.inputSpec = plan.getJobSpec();
this.numberOfThreads = 512;
this.userAgent = WmsaHome.getUserAgent();
workLog = new WorkLog(plan.crawl.getLogFile());
domainWriter = new CrawledDomainWriter(plan.crawl.getDir());
@ -88,7 +92,7 @@ public class CrawlerMain implements AutoCloseable {
if (workLog.isJobFinished(specification.id))
return null;
var fetcher = new HttpFetcher("search.marginalia.nu", dispatcher);
var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher);
try {
var retreiver = new CrawlerRetreiver(fetcher, specification);