Changes to crawler (#28)

Co-authored-by: vlofgren <vlofgren@gmail.com>
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/28
This commit is contained in:
Viktor Lofgren 2022-06-15 16:54:27 +02:00
parent 5c2f2d558f
commit a3a6b40cc3
23 changed files with 203 additions and 202 deletions

View File

@ -5,7 +5,8 @@ the [MEMEX/gemini server](https://memex.marginalia.nu), the and the [encyclopedi
The aim of the project is to develop new and alternative discovery methods for the Internet. The aim of the project is to develop new and alternative discovery methods for the Internet.
It's an experimental workshop as much as it is a public service, the overarching goal is to It's an experimental workshop as much as it is a public service, the overarching goal is to
elevate the more human, non-commercial sides of the Internet. elevate the more human, non-commercial sides of the Internet. A side-goal is to do this without
requiring datacenters and expensive enterprise hardware, running the operation on affordable hardware instead.
The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu). The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu).
It is fine to mirror it on other hosts, but if you have issues or questions It is fine to mirror it on other hosts, but if you have issues or questions
@ -16,6 +17,10 @@ it wasn't developed with the intention of going open source, a lot of tests
and so on make assumptions about the directory structure, much configuration and so on make assumptions about the directory structure, much configuration
is hard coded and so on. Please stand by. A lot of the mess is fairly superficial. is hard coded and so on. Please stand by. A lot of the mess is fairly superficial.
## Documentation
Documentation is a work in progress. See the [wiki](https://git.marginalia.nu/marginalia/marginalia.nu/wiki).
## Contributing ## Contributing
The project is still being set up, but if you are interested in contributing, please contact me. The project is still being set up, but if you are interested in contributing, please contact me.

View File

@ -33,7 +33,7 @@ public abstract class E2ETestBase {
.withCopyFileToContainer(jarFile(), "/WMSA.jar") .withCopyFileToContainer(jarFile(), "/WMSA.jar")
.withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh") .withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh")
.withExposedPorts(service.port) .withExposedPorts(service.port)
.withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY) .withFileSystemBind(modelsPath(), "/wmsa/model", BindMode.READ_ONLY)
.withNetwork(network) .withNetwork(network)
.withNetworkAliases(service.name) .withNetworkAliases(service.name)
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name))) .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name)))

View File

@ -3,6 +3,8 @@
mkdir -p /var/lib/wmsa/conf/ mkdir -p /var/lib/wmsa/conf/
mkdir -p /var/lib/wmsa/data/ mkdir -p /var/lib/wmsa/data/
echo "search.marginalia.nu" > /var/lib/wmsa/conf/user-agent
cat > /var/lib/wmsa/conf/db.properties <<EOF cat > /var/lib/wmsa/conf/db.properties <<EOF
db.user=wmsa db.user=wmsa
db.pass=wmsa db.pass=wmsa

View File

@ -1,13 +1,15 @@
#!/bin/bash #!/bin/bash
mkdir -p /var/lib/wmsa/encyclopedia HOME=/wmsa
mkdir -p /var/lib/wmsa/conf
mkdir -p /var/lib/wmsa/index/write
mkdir -p /var/lib/wmsa/index/read
mkdir -p /backup/work/index-tmp
mkdir -p /var/log/wmsa mkdir -p ${HOME}/encyclopedia
cat > /var/lib/wmsa/suggestions.txt <<EOF mkdir -p ${HOME}/conf
mkdir -p ${HOME}/index/write
mkdir -p ${HOME}/index/read
mkdir -p ${HOME}/tmp-slow
mkdir -p ${HOME}/tmp-fast
cat > ${HOME}/suggestions.txt <<EOF
state state
three three
while while
@ -22,17 +24,22 @@ many
year year
EOF EOF
cat > /var/lib/wmsa/conf/disks.properties <<EOF cat > ${HOME}/conf/disks.properties <<EOF
encyclopedia=/var/lib/wmsa/encyclopedia encyclopedia=${HOME}/encyclopedia
index-write=${HOME}/index/write
index-read=${HOME}/index/read
tmp-slow=${HOME}/tmp-slow
tmp-fast=${HOME}/tmp-fast
EOF EOF
cat > /var/lib/wmsa/conf/db.properties <<EOF cat > ${HOME}/conf/db.properties <<EOF
db.user=wmsa db.user=wmsa
db.pass=wmsa db.pass=wmsa
db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true
EOF EOF
cat > /var/lib/wmsa/conf/ranking-settings.yaml <<EOF cat > ${HOME}/conf/ranking-settings.yaml <<EOF
--- ---
retro: retro:
- "%" - "%"
@ -46,7 +53,7 @@ standard:
- "%" - "%"
EOF EOF
cat > /var/lib/wmsa/conf/hosts <<EOF cat > ${HOME}/conf/hosts <<EOF
# service-name host-name # service-name host-name
resource-store resource-store resource-store resource-store
renderer renderer renderer renderer
@ -62,4 +69,4 @@ memex memex
dating dating dating dating
EOF EOF
java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1 WMSA_HOME=${HOME} java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1

View File

@ -0,0 +1,5 @@
package nu.marginalia.wmsa.configuration;
/**
 * Immutable value holder for a user-agent string.
 *
 * @param uaString the raw user-agent text; stored as given, with no validation
 */
public record UserAgent(String uaString) {}

View File

@ -0,0 +1,7 @@
package nu.marginalia.wmsa.configuration;
/**
 * Immutable holder for a website's base URL.
 *
 * @param url the base URL, with or without a trailing slash
 */
public record WebsiteUrl(String url) {

    /**
     * Appends {@code path} to the base URL, making sure the two parts are
     * separated by exactly one slash regardless of whether the base URL ends
     * with one or the path starts with one.
     *
     * <p>An empty path returns the base URL unchanged. A path that starts with
     * {@code ?} or {@code #} is appended as-is, since inserting a slash before
     * a query string or fragment would address a different resource.
     *
     * @param path the path (optionally with query string) to append; must not be null
     * @return the combined URL
     * @throws NullPointerException if {@code path} is null
     */
    public String withPath(String path) {
        if (path.isEmpty()) {
            return url;
        }

        final boolean baseEndsWithSlash = url.endsWith("/");
        final char first = path.charAt(0);

        if (first == '/') {
            // Avoid producing "//" when both sides supply the separator.
            return baseEndsWithSlash ? url + path.substring(1) : url + path;
        }
        if (baseEndsWithSlash || first == '?' || first == '#') {
            return url + path;
        }
        // Neither side supplied a separator; add one so host and path don't fuse.
        return url + "/" + path;
    }
}

View File

@ -1,15 +1,31 @@
package nu.marginalia.wmsa.configuration; package nu.marginalia.wmsa.configuration;
import nu.marginalia.util.language.conf.LanguageModels;
import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Optional;
import java.util.Properties; import java.util.Properties;
public class WmsaHome { public class WmsaHome {
private static final String DEFAULT = "/var/lib/wmsa"; private static final String DEFAULT = "/var/lib/wmsa";
public static UserAgent getUserAgent() throws IOException {
var uaPath = getHomePath().resolve("conf/user-agent");
if (!Files.exists(uaPath)) {
throw new FileNotFoundException("Could not find " + uaPath);
}
return new UserAgent(Files.readString(uaPath).trim());
}
public static Path getHomePath() { public static Path getHomePath() {
var ret = Path.of(System.getProperty("WMSA_HOME", DEFAULT)); var retStr = Optional.ofNullable(System.getenv("WMSA_HOME")).orElse(DEFAULT);
var ret = Path.of(retStr);
if (!Files.isDirectory(ret)) { if (!Files.isDirectory(ret)) {
throw new IllegalStateException("Could not find WMSA_HOME, either set environment variable or ensure " + DEFAULT + " exists"); throw new IllegalStateException("Could not find WMSA_HOME, either set environment variable or ensure " + DEFAULT + " exists");
} }
@ -34,26 +50,44 @@ public class WmsaHome {
return getHomePath().resolve("data").resolve("IP2LOCATION-LITE-DB1.CSV"); return getHomePath().resolve("data").resolve("IP2LOCATION-LITE-DB1.CSV");
} }
public static Path getDisk(String name) throws IOException { public static Path getDisk(String name) {
Path p = Path.of(getDiskProperties().getProperty(name)); var pathStr = getDiskProperties().getProperty(name);
if (null == pathStr) {
throw new RuntimeException("Disk " + name + " was not configured");
}
Path p = Path.of(pathStr);
if (!Files.isDirectory(p)) { if (!Files.isDirectory(p)) {
throw new IOException(name + " does not exist!"); throw new RuntimeException("Disk " + name + " does not exist or is not a directory!");
} }
return p; return p;
} }
public static Properties getDiskProperties() throws IOException { public static Properties getDiskProperties() {
Path settingsFile = getHomePath().resolve("conf/disks.properties"); Path settingsFile = getHomePath().resolve("conf/disks.properties");
if (Files.isRegularFile(settingsFile)) { if (!Files.isRegularFile(settingsFile)) {
throw new RuntimeException("Could not find disk settings " + settingsFile);
}
try (var is = Files.newInputStream(settingsFile)) { try (var is = Files.newInputStream(settingsFile)) {
var props = new Properties(); var props = new Properties();
props.load(is); props.load(is);
return props; return props;
} }
catch (IOException ex) {
throw new RuntimeException(ex);
} }
else {
throw new IOException("Could not find disk settings " + settingsFile);
} }
public static LanguageModels getLanguageModels() {
final Path home = getHomePath();
return new LanguageModels(
home.resolve("model/ngrams-generous-emstr.bin"),
home.resolve("model/tfreq-new-algo3.bin"),
home.resolve("model/opennlp-sentence.bin"),
home.resolve("model/English.RDR"),
home.resolve("model/English.DICT"),
home.resolve("model/opennlp-tok.bin"));
} }
} }

View File

@ -2,44 +2,26 @@ package nu.marginalia.wmsa.configuration.module;
import com.google.inject.AbstractModule; import com.google.inject.AbstractModule;
import com.google.inject.Provides; import com.google.inject.Provides;
import com.google.inject.Singleton;
import com.google.inject.name.Named; import com.google.inject.name.Named;
import lombok.SneakyThrows; import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Objects; import java.util.Objects;
import static com.google.inject.name.Names.named; import static com.google.inject.name.Names.named;
public class ConfigurationModule extends AbstractModule { public class ConfigurationModule extends AbstractModule {
private static final String SERVICE_NAME = System.getProperty("service-name"); private static final String SERVICE_NAME = System.getProperty("service-name");
public static final int MONITOR_PORT = Integer.getInteger("monitor.port", 5000);
public static final String MONITOR_HOST = System.getProperty("monitor.host", "127.0.0.1");
public void configure() { public void configure() {
bind(Integer.class).annotatedWith(named("monitor-port")).toInstance(MONITOR_PORT);
bind(String.class).annotatedWith(named("monitor-host")).toInstance(MONITOR_HOST);
bind(Integer.class).annotatedWith(named("monitor-boot-timeout")).toInstance(10);
bind(String.class).annotatedWith(named("service-name")).toInstance(Objects.requireNonNull(SERVICE_NAME)); bind(String.class).annotatedWith(named("service-name")).toInstance(Objects.requireNonNull(SERVICE_NAME));
bind(String.class).annotatedWith(named("service-host")).toProvider(HostnameProvider.class).in(Singleton.class); bind(String.class).annotatedWith(named("service-host")).toInstance(System.getProperty("service-host", "127.0.0.1"));
bind(Integer.class).annotatedWith(named("service-port")).toProvider(PortProvider.class).in(Singleton.class); bind(Integer.class).annotatedWith(named("service-port")).toInstance(ServiceDescriptor.byName(System.getProperty("service-name")).port);
bind(Integer.class).annotatedWith(named("metrics-server-port")).toProvider(MetricsPortProvider.class).in(Singleton.class);
} }
@Provides @Provides
@Named("build-version") @Named("metrics-server-port")
@SneakyThrows public Integer provideMetricsServerPort(@Named("service-port") Integer servicePort) {
public String buildVersion() { return servicePort + 1000;
try (var str = ClassLoader.getSystemResourceAsStream("_version.txt")) {
if (null == str) {
System.err.println("Missing _version.txt from classpath");
return LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME);
}
return new String(str.readAllBytes());
}
} }
} }

View File

@ -1,36 +0,0 @@
package nu.marginalia.wmsa.configuration.module;
import com.google.inject.name.Named;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.inject.Provider;
/**
 * Supplies the host name a service should use.
 *
 * <p>The value of the {@code service-host} system property is returned when it
 * is set; otherwise the loopback address {@code 127.0.0.1} is used. The
 * injected monitor settings are stored but not consulted by {@link #get()}.
 */
public class HostnameProvider implements Provider<String> {
    private static final String DEFAULT_HOSTNAME = "127.0.0.1";

    private final int monitorPort;
    private final String monitorHost;
    private final int timeout;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * @param monitorPort value bound to {@code monitor-port}
     * @param monitorHost value bound to {@code monitor-host}
     * @param timeout     value bound to {@code monitor-boot-timeout}
     */
    @Inject
    public HostnameProvider(@Named("monitor-port") Integer monitorPort,
                            @Named("monitor-host") String monitorHost,
                            @Named("monitor-boot-timeout") Integer timeout) {
        this.monitorPort = monitorPort;
        this.monitorHost = monitorHost;
        this.timeout = timeout;
    }

    /**
     * @return the {@code service-host} system property, or the default host
     *         name when the property is not set
     */
    @Override
    public String get() {
        return System.getProperty("service-host", DEFAULT_HOSTNAME);
    }
}

View File

@ -1,46 +0,0 @@
package nu.marginalia.wmsa.configuration.module;
import com.google.inject.name.Named;
import io.reactivex.rxjava3.core.Flowable;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import org.apache.http.HttpResponse;
import org.reactivestreams.Publisher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.inject.Provider;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
public class PortProvider implements Provider<Integer> {
private static final Integer DEFAULT_PORT = 5000;
private final int monitorPort;
private final String monitorHost;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final int timeout = 10;
@Inject
public PortProvider(@Named("monitor-port") Integer monitorPort,
@Named("monitor-host") String monitorHost,
@Named("monitor-boot-timeout") Integer timeout) {
this.monitorHost = monitorHost;
this.monitorPort = monitorPort;
}
@Override
public Integer get() {
return ServiceDescriptor.byName(System.getProperty("service-name")).port;
}
private Publisher<?> repeatDelay(Flowable<Throwable> error) {
return error.delay(1, TimeUnit.SECONDS);
}
private String accept200(HttpResponse rsp) throws IOException {
if (rsp.getStatusLine().getStatusCode() != 200) {
throw new RuntimeException("Monitor responded unexpected status "
+ rsp.getStatusLine().getStatusCode());
}
return new String(rsp.getEntity().getContent().readAllBytes());
}
}

View File

@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.assistant;
import com.google.inject.AbstractModule; import com.google.inject.AbstractModule;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.configuration.WmsaHome;
import java.nio.file.Path; import java.nio.file.Path;
@ -9,14 +10,8 @@ import static com.google.inject.name.Names.named;
public class EdgeAssistantModule extends AbstractModule { public class EdgeAssistantModule extends AbstractModule {
public void configure() { public void configure() {
bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(Path.of("/var/lib/wmsa/suggestions.txt")); bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("suggestions.txt"));
bind(LanguageModels.class).toInstance(new LanguageModels(
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"), bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
Path.of("/var/lib/wmsa/model/tfreq-new-algo3.bin"),
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
Path.of("/var/lib/wmsa/model/English.RDR"),
Path.of("/var/lib/wmsa/model/English.DICT"),
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
));
} }
} }

View File

@ -5,6 +5,7 @@ import com.google.inject.AbstractModule;
import com.google.inject.name.Names; import com.google.inject.name.Names;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory; import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.EdgeUrl;
@ -30,14 +31,7 @@ public class ConverterModule extends AbstractModule {
bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128); bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255); bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
bind(LanguageModels.class).toInstance(new LanguageModels( bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
Path.of("/var/lib/wmsa/model/tfreq-new-algo3.bin"),
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
Path.of("/var/lib/wmsa/model/English.RDR"),
Path.of("/var/lib/wmsa/model/English.DICT"),
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
));
} }
private Gson createGson() { private Gson createGson() {

View File

@ -185,26 +185,25 @@ public class DocumentProcessor {
} }
private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) { private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
var links = doc.getElementsByTag("a");
var frames = doc.getElementsByTag("frame");
var feeds = doc.select("link[rel=alternate]");
LinkProcessor lp = new LinkProcessor(ret, baseUrl); final LinkProcessor lp = new LinkProcessor(ret, baseUrl);
for (var atag : links) { baseUrl = linkParser.getBaseLink(doc, baseUrl);
for (var atag : doc.getElementsByTag("a")) {
linkParser.parseLink(baseUrl, atag).ifPresent(lp::accept); linkParser.parseLink(baseUrl, atag).ifPresent(lp::accept);
} }
for (var frame : frames) { for (var frame : doc.getElementsByTag("frame")) {
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept); linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
} }
for (var link : feeds) { for (var link : doc.select("link[rel=alternate]")) {
feedExtractor feedExtractor
.getFeedFromAlternateTag(baseUrl, link) .getFeedFromAlternateTag(baseUrl, link)
.ifPresent(lp::acceptFeed); .ifPresent(lp::acceptFeed);
} }
Set<String> linkTerms = new HashSet<>(); final Set<String> linkTerms = new HashSet<>();
for (var domain : lp.getForeignDomains()) { for (var domain : lp.getForeignDomains()) {
linkTerms.add("links:"+domain.toString().toLowerCase()); linkTerms.add("links:"+domain.toString().toLowerCase());

View File

@ -1,9 +1,12 @@
package nu.marginalia.wmsa.edge.converting.processor.logic; package nu.marginalia.wmsa.edge.converting.processor.logic;
import com.google.common.base.CharMatcher; import com.google.common.base.CharMatcher;
import com.google.common.base.Strings;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jetbrains.annotations.Contract; import org.jetbrains.annotations.Contract;
import org.jetbrains.annotations.Nullable;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -26,11 +29,11 @@ public class LinkParser {
".gz", ".asc", ".md5", ".asf", ".mov", ".sig", ".pub", ".iso"); ".gz", ".asc", ".md5", ".asf", ".mov", ".sig", ".pub", ".iso");
@Contract(pure=true) @Contract(pure=true)
public Optional<EdgeUrl> parseLink(EdgeUrl baseUrl, Element l) { public Optional<EdgeUrl> parseLink(EdgeUrl relativeBaseUrl, Element l) {
return Optional.of(l) return Optional.of(l)
.filter(this::shouldIndexLink) .filter(this::shouldIndexLink)
.map(this::getUrl) .map(this::getUrl)
.map(link -> resolveUrl(baseUrl, link)) .map(link -> resolveUrl(relativeBaseUrl, link))
.flatMap(this::createURI) .flatMap(this::createURI)
.map(URI::normalize) .map(URI::normalize)
.map(this::renormalize) .map(this::renormalize)
@ -100,6 +103,8 @@ public class LinkParser {
} }
private static final Pattern paramRegex = Pattern.compile("\\?.*$"); private static final Pattern paramRegex = Pattern.compile("\\?.*$");
private static final Pattern spaceRegex = Pattern.compile(" ");
@SneakyThrows @SneakyThrows
private String resolveUrl(EdgeUrl baseUrl, String s) { private String resolveUrl(EdgeUrl baseUrl, String s) {
s = paramRegex.matcher(s).replaceAll(""); s = paramRegex.matcher(s).replaceAll("");
@ -111,10 +116,12 @@ public class LinkParser {
// url looks like /my-page // url looks like /my-page
if (s.startsWith("/")) { if (s.startsWith("/")) {
return baseUrl.sibling(s).toString(); return baseUrl.withPath(s).toString();
} }
return baseUrl.sibling(relativeNavigation(baseUrl) + s.replaceAll(" ", "%20")).toString(); final String partFromNewLink = spaceRegex.matcher(s).replaceAll("%20");
return baseUrl.withPath(relativeNavigation(baseUrl) + partFromNewLink).toString();
} }
// for a relative url that looks like /foo or /foo/bar; return / or /foo // for a relative url that looks like /foo or /foo/bar; return / or /foo
@ -162,4 +169,23 @@ public class LinkParser {
} }
return true; return true;
} }
@Nullable
public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) {
var baseTags = parsed.getElementsByTag("base");
try {
for (var tag : baseTags) {
String href = tag.attr("href");
if (!Strings.isNullOrEmpty(href)) {
return new EdgeUrl(resolveUrl(documentUrl, href));
}
}
}
catch (Exception ex) {
logger.warn("Failed to parse <base href=...>, falling back to document url");
}
return documentUrl;
}
} }

View File

@ -2,6 +2,8 @@ package nu.marginalia.wmsa.edge.crawling;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.google.gson.GsonBuilder; import com.google.gson.GsonBuilder;
import nu.marginalia.wmsa.configuration.UserAgent;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver; import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver;
@ -34,10 +36,12 @@ public class CrawlerMain implements AutoCloseable {
private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS, private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true))); new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
private final UserAgent userAgent;
public CrawlerMain(EdgeCrawlPlan plan) throws Exception { public CrawlerMain(EdgeCrawlPlan plan) throws Exception {
this.inputSpec = plan.getJobSpec(); this.inputSpec = plan.getJobSpec();
this.numberOfThreads = 512; this.numberOfThreads = 512;
this.userAgent = WmsaHome.getUserAgent();
workLog = new WorkLog(plan.crawl.getLogFile()); workLog = new WorkLog(plan.crawl.getLogFile());
domainWriter = new CrawledDomainWriter(plan.crawl.getDir()); domainWriter = new CrawledDomainWriter(plan.crawl.getDir());
@ -88,7 +92,7 @@ public class CrawlerMain implements AutoCloseable {
if (workLog.isJobFinished(specification.id)) if (workLog.isJobFinished(specification.id))
return null; return null;
var fetcher = new HttpFetcher("search.marginalia.nu", dispatcher); var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher);
try { try {
var retreiver = new CrawlerRetreiver(fetcher, specification); var retreiver = new CrawlerRetreiver(fetcher, specification);

View File

@ -202,10 +202,11 @@ public class CrawlerRetreiver {
return domain.equals(url.domain.toString().toLowerCase()); return domain.equals(url.domain.toString().toLowerCase());
} }
private void findLinks(EdgeUrl url, Document parsed) { private void findLinks(EdgeUrl baseUrl, Document parsed) {
baseUrl = linkParser.getBaseLink(parsed, baseUrl);
for (var link : parsed.getElementsByTag("a")) { for (var link : parsed.getElementsByTag("a")) {
linkParser.parseLink(url, link) linkParser.parseLink(baseUrl, link)
.filter(this::isSameDomain) .filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u)) .filter(u -> !urlBlocklist.isUrlBlocked(u))
.filter(u -> !urlBlocklist.isForumLink(u)) .filter(u -> !urlBlocklist.isForumLink(u))
@ -213,7 +214,7 @@ public class CrawlerRetreiver {
.ifPresent(queue::addLast); .ifPresent(queue::addLast);
} }
for (var link : parsed.getElementsByTag("frame")) { for (var link : parsed.getElementsByTag("frame")) {
linkParser.parseFrame(url, link) linkParser.parseFrame(baseUrl, link)
.filter(this::isSameDomain) .filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u)) .filter(u -> !urlBlocklist.isUrlBlocked(u))
.filter(u -> !urlBlocklist.isForumLink(u)) .filter(u -> !urlBlocklist.isForumLink(u))
@ -221,7 +222,7 @@ public class CrawlerRetreiver {
.ifPresent(queue::addLast); .ifPresent(queue::addLast);
} }
for (var link : parsed.getElementsByTag("iframe")) { for (var link : parsed.getElementsByTag("iframe")) {
linkParser.parseFrame(url, link) linkParser.parseFrame(baseUrl, link)
.filter(this::isSameDomain) .filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u)) .filter(u -> !urlBlocklist.isUrlBlocked(u))
.filter(u -> !urlBlocklist.isForumLink(u)) .filter(u -> !urlBlocklist.isForumLink(u))
@ -230,10 +231,11 @@ public class CrawlerRetreiver {
} }
} }
private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl url, Document parsed) { private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl baseUrl, Document parsed) {
baseUrl = baseUrl.withPath("/");
for (var link : parsed.select("link[rel=canonical]")) { for (var link : parsed.select("link[rel=canonical]")) {
return linkParser.parseLink(url, link); return linkParser.parseLink(baseUrl, link);
} }
return Optional.empty(); return Optional.empty();

View File

@ -2,17 +2,18 @@ package nu.marginalia.wmsa.edge.index;
import com.google.inject.AbstractModule; import com.google.inject.AbstractModule;
import com.google.inject.name.Names; import com.google.inject.name.Names;
import nu.marginalia.wmsa.configuration.WmsaHome;
import java.nio.file.Path; import java.nio.file.Path;
public class EdgeTablesModule extends AbstractModule { public class EdgeTablesModule extends AbstractModule {
public void configure() { public void configure() {
bind(Path.class).annotatedWith(Names.named("partition-root-slow")).toInstance(Path.of("/var/lib/wmsa/index/write")); bind(Path.class).annotatedWith(Names.named("partition-root-slow")).toInstance(WmsaHome.getDisk("index-write"));
bind(Path.class).annotatedWith(Names.named("partition-root-slow-tmp")).toInstance(Path.of("/backup/work/index-tmp/")); bind(Path.class).annotatedWith(Names.named("partition-root-fast")).toInstance(WmsaHome.getDisk("index-read"));
bind(Path.class).annotatedWith(Names.named("partition-root-fast")).toInstance(Path.of("/var/lib/wmsa/index/read")); bind(Path.class).annotatedWith(Names.named("partition-root-slow-tmp")).toInstance(WmsaHome.getDisk("tmp-slow"));
bind(Path.class).annotatedWith(Names.named("tmp-file-dir")).toInstance(Path.of("/var/lib/wmsa/index/read")); bind(Path.class).annotatedWith(Names.named("tmp-file-dir")).toInstance(WmsaHome.getDisk("tmp-fast"));
bind(String.class).annotatedWith(Names.named("edge-writer-page-index-file")).toInstance("page-index.dat"); bind(String.class).annotatedWith(Names.named("edge-writer-page-index-file")).toInstance("page-index.dat");
bind(String.class).annotatedWith(Names.named("edge-writer-dictionary-file")).toInstance("dictionary.dat"); bind(String.class).annotatedWith(Names.named("edge-writer-dictionary-file")).toInstance("dictionary.dat");

View File

@ -21,6 +21,7 @@ public class EdgeDomain implements WideHashable {
@SneakyThrows @SneakyThrows
public EdgeDomain(String host) { public EdgeDomain(String host) {
Objects.requireNonNull(host, "domain name must not be null");
var dot = host.lastIndexOf('.'); var dot = host.lastIndexOf('.');

View File

@ -79,11 +79,6 @@ public class EdgeUrl implements WideHashable {
this.port = port(URI.getPort(), proto); this.port = port(URI.getPort(), proto);
} }
public EdgeUrl sibling(String newPath) {
return new EdgeUrl(proto, domain, port, newPath);
}
private static Integer port(Integer port, String protocol) { private static Integer port(Integer port, String protocol) {
if (null == port || port < 1) { if (null == port || port < 1) {
return null; return null;
@ -120,5 +115,7 @@ public class EdgeUrl implements WideHashable {
return (int) path.chars().filter(c -> c=='/').count(); return (int) path.chars().filter(c -> c=='/').count();
} }
public EdgeUrl withPath(String s) {
return new EdgeUrl(proto, domain, port, s);
}
} }

View File

@ -2,21 +2,14 @@ package nu.marginalia.wmsa.edge.search;
import com.google.inject.AbstractModule; import com.google.inject.AbstractModule;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.configuration.WebsiteUrl;
import java.nio.file.Path; import nu.marginalia.wmsa.configuration.WmsaHome;
public class EdgeSearchModule extends AbstractModule { public class EdgeSearchModule extends AbstractModule {
public void configure() { public void configure() {
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
bind(LanguageModels.class).toInstance(new LanguageModels( bind(WebsiteUrl.class).toInstance(new WebsiteUrl(System.getProperty("website-url", "https://search.marginalia.nu/")));
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
Path.of("/var/lib/wmsa/model/tfreq-new-algo3.bin"),
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
Path.of("/var/lib/wmsa/model/English.RDR"),
Path.of("/var/lib/wmsa/model/English.DICT"),
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
));
} }
} }

View File

@ -8,6 +8,7 @@ import com.google.inject.name.Named;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.wmsa.api.model.ApiSearchResult; import nu.marginalia.wmsa.api.model.ApiSearchResult;
import nu.marginalia.wmsa.api.model.ApiSearchResults; import nu.marginalia.wmsa.api.model.ApiSearchResults;
import nu.marginalia.wmsa.configuration.WebsiteUrl;
import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.MetricsServer;
@ -34,7 +35,7 @@ public class EdgeSearchService extends Service {
private final EdgeIndexClient indexClient; private final EdgeIndexClient indexClient;
private final EdgeSearchOperator searchOperator; private final EdgeSearchOperator searchOperator;
private final CommandEvaluator searchCommandEvaulator; private final CommandEvaluator searchCommandEvaulator;
private final WebsiteUrl websiteUrl;
private static final Logger logger = LoggerFactory.getLogger(EdgeSearchService.class); private static final Logger logger = LoggerFactory.getLogger(EdgeSearchService.class);
@SneakyThrows @SneakyThrows
@ -45,13 +46,14 @@ public class EdgeSearchService extends Service {
Initialization initialization, Initialization initialization,
MetricsServer metricsServer, MetricsServer metricsServer,
EdgeSearchOperator searchOperator, EdgeSearchOperator searchOperator,
CommandEvaluator searchCommandEvaulator CommandEvaluator searchCommandEvaulator,
) { WebsiteUrl websiteUrl) {
super(ip, port, initialization, metricsServer); super(ip, port, initialization, metricsServer);
this.indexClient = indexClient; this.indexClient = indexClient;
this.searchOperator = searchOperator; this.searchOperator = searchOperator;
this.searchCommandEvaulator = searchCommandEvaulator; this.searchCommandEvaulator = searchCommandEvaulator;
this.websiteUrl = websiteUrl;
Spark.staticFiles.expireTime(600); Spark.staticFiles.expireTime(600);
@ -79,7 +81,7 @@ public class EdgeSearchService extends Service {
final String query = URLEncoder.encode(String.format("%s site:%s", queryRaw, site), StandardCharsets.UTF_8); final String query = URLEncoder.encode(String.format("%s site:%s", queryRaw, site), StandardCharsets.UTF_8);
final String profile = request.queryParamOrDefault("profile", "yolo"); final String profile = request.queryParamOrDefault("profile", "yolo");
response.redirect("https://search.marginalia.nu/search?query="+query+"&profile="+profile); response.redirect(websiteUrl.withPath("search?query="+query+"&profile="+profile));
return null; return null;
} }
@ -141,7 +143,7 @@ public class EdgeSearchService extends Service {
final String queryParam = request.queryParams("query"); final String queryParam = request.queryParams("query");
if (null == queryParam || queryParam.isBlank()) { if (null == queryParam || queryParam.isBlank()) {
response.redirect("https://search.marginalia.nu/"); response.redirect(websiteUrl.url());
return null; return null;
} }

View File

@ -7,7 +7,6 @@ import java.nio.file.Path;
public class ResourceStoreModule extends AbstractModule { public class ResourceStoreModule extends AbstractModule {
public void configure() { public void configure() {
bind(String.class).annotatedWith(Names.named("external-url")).toInstance("https://reddit.marginalia.nu/");
bind(Path.class).annotatedWith(Names.named("data-path")).toInstance(Path.of("/var/lib/wmsa/archive.fast/resources")); bind(Path.class).annotatedWith(Names.named("data-path")).toInstance(Path.of("/var/lib/wmsa/archive.fast/resources"));
} }

View File

@ -11,9 +11,8 @@ import static org.junit.jupiter.api.Assertions.*;
class LinkParserTest { class LinkParserTest {
private String parseLink(String href, String base) throws URISyntaxException { private String parseLink(String href, String relBase) throws URISyntaxException {
var url = new EdgeUrl("http://www.marginalia.nu/" + base); var url = new EdgeUrl("http://www.marginalia.nu/" + relBase);
var domain = url.domain;
var parser = new LinkParser(); var parser = new LinkParser();
var stuff = Jsoup.parseBodyFragment("<a href='"+href+"''>test</a>"); var stuff = Jsoup.parseBodyFragment("<a href='"+href+"''>test</a>");
var lnk = parser.parseLink( var lnk = parser.parseLink(
@ -43,6 +42,7 @@ class LinkParserTest {
void testAnchor() throws URISyntaxException { void testAnchor() throws URISyntaxException {
assertNull(parseLink("#test", "/")); assertNull(parseLink("#test", "/"));
} }
@Test @Test
void testRelative() throws URISyntaxException { void testRelative() throws URISyntaxException {
assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/")); assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/"));
@ -51,4 +51,32 @@ class LinkParserTest {
assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/foo/index.html")); assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/foo/index.html"));
assertEquals("http://www.marginalia.nu/test", parseLink("/test", "/foo/index.html")); assertEquals("http://www.marginalia.nu/test", parseLink("/test", "/foo/index.html"));
} }
private EdgeUrl getBaseUrl(String href, EdgeUrl documentUrl) {
LinkParser lp = new LinkParser();
return lp.getBaseLink(Jsoup.parse("<base href=\"" + href + "\" />"), documentUrl);
}
@Test
public void getBaseUrlTest() throws URISyntaxException {
assertEquals(new EdgeUrl("https://www.marginalia.nu/base"),
getBaseUrl("/base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar")));
assertEquals(new EdgeUrl("https://memex.marginalia.nu/base"),
getBaseUrl("https://memex.marginalia.nu/base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar")));
assertEquals(new EdgeUrl("https://www.marginalia.nu/test/base"),
getBaseUrl("base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar")));
}
@Test
public void testParseBadBaseLink() throws URISyntaxException {
LinkParser lp = new LinkParser();
var url = new EdgeUrl("https://memex.marginalia.nu/");
assertEquals(url, lp.getBaseLink(Jsoup.parse("<base href/>"), url));
assertEquals(url, lp.getBaseLink(Jsoup.parse("<base target=\"foo\"/>"), url));
assertEquals(url, lp.getBaseLink(Jsoup.parse("<base href=\"http://\"/>"), url));
}
} }