mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Changes to crawler (#28)
Co-authored-by: vlofgren <vlofgren@gmail.com> Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/28
This commit is contained in:
parent
5c2f2d558f
commit
a3a6b40cc3
@ -5,7 +5,8 @@ the [MEMEX/gemini server](https://memex.marginalia.nu), the and the [encyclopedi
|
|||||||
|
|
||||||
The aim of the project is to develop new and alternative discovery methods for the Internet.
|
The aim of the project is to develop new and alternative discovery methods for the Internet.
|
||||||
It's an experimental workshop as much as it is a public service, the overarching goal is to
|
It's an experimental workshop as much as it is a public service, the overarching goal is to
|
||||||
elevate the more human, non-commercial sides of the Internet.
|
elevate the more human, non-commercial sides of the Internet. A side-goal is to do this without
|
||||||
|
requiring datacenters and expensive enterprise hardware, and to run this operation on affordable hardware.
|
||||||
|
|
||||||
The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu).
|
The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu).
|
||||||
It is fine to mirror it on other hosts, but if you have issues or questions
|
It is fine to mirror it on other hosts, but if you have issues or questions
|
||||||
@ -16,6 +17,10 @@ it wasn't developed with the intention of going open source, a lot of tests
|
|||||||
and so on make assumptions about the directory structure, much configuration
|
and so on make assumptions about the directory structure, much configuration
|
||||||
is hard coded and so on. Please stand by. A lot of the mess is fairly superficial.
|
is hard coded and so on. Please stand by. A lot of the mess is fairly superficial.
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
Documentation is a work in progress. See the [wiki](https://git.marginalia.nu/marginalia/marginalia.nu/wiki).
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
The project is still being set up, but if you are interested in contributing, please contact me.
|
The project is still being set up, but if you are interested in contributing, please contact me.
|
||||||
|
@ -33,7 +33,7 @@ public abstract class E2ETestBase {
|
|||||||
.withCopyFileToContainer(jarFile(), "/WMSA.jar")
|
.withCopyFileToContainer(jarFile(), "/WMSA.jar")
|
||||||
.withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh")
|
.withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh")
|
||||||
.withExposedPorts(service.port)
|
.withExposedPorts(service.port)
|
||||||
.withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY)
|
.withFileSystemBind(modelsPath(), "/wmsa/model", BindMode.READ_ONLY)
|
||||||
.withNetwork(network)
|
.withNetwork(network)
|
||||||
.withNetworkAliases(service.name)
|
.withNetworkAliases(service.name)
|
||||||
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name)))
|
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name)))
|
||||||
|
@ -3,6 +3,8 @@
|
|||||||
mkdir -p /var/lib/wmsa/conf/
|
mkdir -p /var/lib/wmsa/conf/
|
||||||
mkdir -p /var/lib/wmsa/data/
|
mkdir -p /var/lib/wmsa/data/
|
||||||
|
|
||||||
|
echo "search.marginalia.nu" > /var/lib/wmsa/conf/user-agent
|
||||||
|
|
||||||
cat > /var/lib/wmsa/conf/db.properties <<EOF
|
cat > /var/lib/wmsa/conf/db.properties <<EOF
|
||||||
db.user=wmsa
|
db.user=wmsa
|
||||||
db.pass=wmsa
|
db.pass=wmsa
|
||||||
|
@ -1,13 +1,15 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
mkdir -p /var/lib/wmsa/encyclopedia
|
HOME=/wmsa
|
||||||
mkdir -p /var/lib/wmsa/conf
|
|
||||||
mkdir -p /var/lib/wmsa/index/write
|
|
||||||
mkdir -p /var/lib/wmsa/index/read
|
|
||||||
mkdir -p /backup/work/index-tmp
|
|
||||||
|
|
||||||
mkdir -p /var/log/wmsa
|
mkdir -p ${HOME}/encyclopedia
|
||||||
cat > /var/lib/wmsa/suggestions.txt <<EOF
|
mkdir -p ${HOME}/conf
|
||||||
|
mkdir -p ${HOME}/index/write
|
||||||
|
mkdir -p ${HOME}/index/read
|
||||||
|
mkdir -p ${HOME}/tmp-slow
|
||||||
|
mkdir -p ${HOME}/tmp-fast
|
||||||
|
|
||||||
|
cat > ${HOME}/suggestions.txt <<EOF
|
||||||
state
|
state
|
||||||
three
|
three
|
||||||
while
|
while
|
||||||
@ -22,17 +24,22 @@ many
|
|||||||
year
|
year
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
cat > /var/lib/wmsa/conf/disks.properties <<EOF
|
cat > ${HOME}/conf/disks.properties <<EOF
|
||||||
encyclopedia=/var/lib/wmsa/encyclopedia
|
encyclopedia=${HOME}/encyclopedia
|
||||||
|
|
||||||
|
index-write=${HOME}/index/write
|
||||||
|
index-read=${HOME}/index/read
|
||||||
|
tmp-slow=${HOME}/tmp-slow
|
||||||
|
tmp-fast=${HOME}/tmp-fast
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
cat > /var/lib/wmsa/conf/db.properties <<EOF
|
cat > ${HOME}/conf/db.properties <<EOF
|
||||||
db.user=wmsa
|
db.user=wmsa
|
||||||
db.pass=wmsa
|
db.pass=wmsa
|
||||||
db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true
|
db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
cat > /var/lib/wmsa/conf/ranking-settings.yaml <<EOF
|
cat > ${HOME}/conf/ranking-settings.yaml <<EOF
|
||||||
---
|
---
|
||||||
retro:
|
retro:
|
||||||
- "%"
|
- "%"
|
||||||
@ -46,7 +53,7 @@ standard:
|
|||||||
- "%"
|
- "%"
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
cat > /var/lib/wmsa/conf/hosts <<EOF
|
cat > ${HOME}/conf/hosts <<EOF
|
||||||
# service-name host-name
|
# service-name host-name
|
||||||
resource-store resource-store
|
resource-store resource-store
|
||||||
renderer renderer
|
renderer renderer
|
||||||
@ -62,4 +69,4 @@ memex memex
|
|||||||
dating dating
|
dating dating
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
|
WMSA_HOME=${HOME} java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
|
@ -0,0 +1,5 @@
|
|||||||
|
package nu.marginalia.wmsa.configuration;
|
||||||
|
|
||||||
|
/**
 * The user-agent string the crawler identifies itself with.
 *
 * @param uaString the user-agent value; must be neither {@code null} nor blank
 */
public record UserAgent(String uaString) {

    /**
     * Rejects unusable values up front so a misconfigured (e.g. empty)
     * user-agent is reported where it is created rather than being
     * sent out silently with every request.
     *
     * @throws NullPointerException     if {@code uaString} is {@code null}
     * @throws IllegalArgumentException if {@code uaString} is blank
     */
    public UserAgent {
        java.util.Objects.requireNonNull(uaString, "uaString");
        if (uaString.isBlank()) {
            throw new IllegalArgumentException("uaString must not be blank");
        }
    }
}
|
@ -0,0 +1,7 @@
|
|||||||
|
package nu.marginalia.wmsa.configuration;
|
||||||
|
|
||||||
|
/**
 * Base URL of a website, with a helper for appending a path to it.
 *
 * @param url the base URL, e.g. {@code https://example.com}
 */
public record WebsiteUrl(String url) {

    /**
     * Appends {@code path} to the base URL.
     *
     * <p>If the base URL ends with {@code /} and {@code path} starts with
     * {@code /}, one of the two slashes is dropped so the result does not
     * contain an accidental {@code //}. In every other case the two strings
     * are concatenated unchanged.
     *
     * @param path the path (and optionally query) to append
     * @return the base URL followed by {@code path}
     * @throws NullPointerException if the base URL or {@code path} is {@code null}
     */
    public String withPath(String path) {
        // Fail loudly instead of producing URLs containing the text "null".
        java.util.Objects.requireNonNull(url, "url");
        java.util.Objects.requireNonNull(path, "path");

        if (url.endsWith("/") && path.startsWith("/")) {
            return url + path.substring(1);
        }
        return url + path;
    }
}
|
@ -1,15 +1,31 @@
|
|||||||
package nu.marginalia.wmsa.configuration;
|
package nu.marginalia.wmsa.configuration;
|
||||||
|
|
||||||
|
import nu.marginalia.util.language.conf.LanguageModels;
|
||||||
|
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
|
||||||
public class WmsaHome {
|
public class WmsaHome {
|
||||||
private static final String DEFAULT = "/var/lib/wmsa";
|
private static final String DEFAULT = "/var/lib/wmsa";
|
||||||
|
|
||||||
|
public static UserAgent getUserAgent() throws IOException {
|
||||||
|
var uaPath = getHomePath().resolve("conf/user-agent");
|
||||||
|
|
||||||
|
if (!Files.exists(uaPath)) {
|
||||||
|
throw new FileNotFoundException("Could not find " + uaPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new UserAgent(Files.readString(uaPath).trim());
|
||||||
|
}
|
||||||
|
|
||||||
public static Path getHomePath() {
|
public static Path getHomePath() {
|
||||||
var ret = Path.of(System.getProperty("WMSA_HOME", DEFAULT));
|
var retStr = Optional.ofNullable(System.getenv("WMSA_HOME")).orElse(DEFAULT);
|
||||||
|
|
||||||
|
var ret = Path.of(retStr);
|
||||||
if (!Files.isDirectory(ret)) {
|
if (!Files.isDirectory(ret)) {
|
||||||
throw new IllegalStateException("Could not find WMSA_HOME, either set environment variable or ensure " + DEFAULT + " exists");
|
throw new IllegalStateException("Could not find WMSA_HOME, either set environment variable or ensure " + DEFAULT + " exists");
|
||||||
}
|
}
|
||||||
@ -34,26 +50,44 @@ public class WmsaHome {
|
|||||||
return getHomePath().resolve("data").resolve("IP2LOCATION-LITE-DB1.CSV");
|
return getHomePath().resolve("data").resolve("IP2LOCATION-LITE-DB1.CSV");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Path getDisk(String name) throws IOException {
|
public static Path getDisk(String name) {
|
||||||
Path p = Path.of(getDiskProperties().getProperty(name));
|
var pathStr = getDiskProperties().getProperty(name);
|
||||||
|
if (null == pathStr) {
|
||||||
|
throw new RuntimeException("Disk " + name + " was not configured");
|
||||||
|
}
|
||||||
|
Path p = Path.of(pathStr);
|
||||||
if (!Files.isDirectory(p)) {
|
if (!Files.isDirectory(p)) {
|
||||||
throw new IOException(name + " does not exist!");
|
throw new RuntimeException("Disk " + name + " does not exist or is not a directory!");
|
||||||
}
|
}
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Properties getDiskProperties() throws IOException {
|
public static Properties getDiskProperties() {
|
||||||
Path settingsFile = getHomePath().resolve("conf/disks.properties");
|
Path settingsFile = getHomePath().resolve("conf/disks.properties");
|
||||||
|
|
||||||
if (Files.isRegularFile(settingsFile)) {
|
if (!Files.isRegularFile(settingsFile)) {
|
||||||
|
throw new RuntimeException("Could not find disk settings " + settingsFile);
|
||||||
|
}
|
||||||
|
|
||||||
try (var is = Files.newInputStream(settingsFile)) {
|
try (var is = Files.newInputStream(settingsFile)) {
|
||||||
var props = new Properties();
|
var props = new Properties();
|
||||||
props.load(is);
|
props.load(is);
|
||||||
return props;
|
return props;
|
||||||
}
|
}
|
||||||
|
catch (IOException ex) {
|
||||||
|
throw new RuntimeException(ex);
|
||||||
}
|
}
|
||||||
else {
|
|
||||||
throw new IOException("Could not find disk settings " + settingsFile);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static LanguageModels getLanguageModels() {
|
||||||
|
final Path home = getHomePath();
|
||||||
|
|
||||||
|
return new LanguageModels(
|
||||||
|
home.resolve("model/ngrams-generous-emstr.bin"),
|
||||||
|
home.resolve("model/tfreq-new-algo3.bin"),
|
||||||
|
home.resolve("model/opennlp-sentence.bin"),
|
||||||
|
home.resolve("model/English.RDR"),
|
||||||
|
home.resolve("model/English.DICT"),
|
||||||
|
home.resolve("model/opennlp-tok.bin"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,44 +2,26 @@ package nu.marginalia.wmsa.configuration.module;
|
|||||||
|
|
||||||
import com.google.inject.AbstractModule;
|
import com.google.inject.AbstractModule;
|
||||||
import com.google.inject.Provides;
|
import com.google.inject.Provides;
|
||||||
import com.google.inject.Singleton;
|
|
||||||
import com.google.inject.name.Named;
|
import com.google.inject.name.Named;
|
||||||
import lombok.SneakyThrows;
|
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
|
||||||
|
|
||||||
import java.time.LocalDateTime;
|
|
||||||
import java.time.format.DateTimeFormatter;
|
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
import static com.google.inject.name.Names.named;
|
import static com.google.inject.name.Names.named;
|
||||||
|
|
||||||
public class ConfigurationModule extends AbstractModule {
|
public class ConfigurationModule extends AbstractModule {
|
||||||
private static final String SERVICE_NAME = System.getProperty("service-name");
|
private static final String SERVICE_NAME = System.getProperty("service-name");
|
||||||
public static final int MONITOR_PORT = Integer.getInteger("monitor.port", 5000);
|
|
||||||
public static final String MONITOR_HOST = System.getProperty("monitor.host", "127.0.0.1");
|
|
||||||
|
|
||||||
public void configure() {
|
public void configure() {
|
||||||
bind(Integer.class).annotatedWith(named("monitor-port")).toInstance(MONITOR_PORT);
|
|
||||||
bind(String.class).annotatedWith(named("monitor-host")).toInstance(MONITOR_HOST);
|
|
||||||
bind(Integer.class).annotatedWith(named("monitor-boot-timeout")).toInstance(10);
|
|
||||||
|
|
||||||
bind(String.class).annotatedWith(named("service-name")).toInstance(Objects.requireNonNull(SERVICE_NAME));
|
bind(String.class).annotatedWith(named("service-name")).toInstance(Objects.requireNonNull(SERVICE_NAME));
|
||||||
bind(String.class).annotatedWith(named("service-host")).toProvider(HostnameProvider.class).in(Singleton.class);
|
bind(String.class).annotatedWith(named("service-host")).toInstance(System.getProperty("service-host", "127.0.0.1"));
|
||||||
bind(Integer.class).annotatedWith(named("service-port")).toProvider(PortProvider.class).in(Singleton.class);
|
bind(Integer.class).annotatedWith(named("service-port")).toInstance(ServiceDescriptor.byName(System.getProperty("service-name")).port);
|
||||||
bind(Integer.class).annotatedWith(named("metrics-server-port")).toProvider(MetricsPortProvider.class).in(Singleton.class);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Provides
|
@Provides
|
||||||
@Named("build-version")
|
@Named("metrics-server-port")
|
||||||
@SneakyThrows
|
public Integer provideMetricsServerPort(@Named("service-port") Integer servicePort) {
|
||||||
public String buildVersion() {
|
return servicePort + 1000;
|
||||||
try (var str = ClassLoader.getSystemResourceAsStream("_version.txt")) {
|
|
||||||
if (null == str) {
|
|
||||||
System.err.println("Missing _version.txt from classpath");
|
|
||||||
return LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME);
|
|
||||||
}
|
|
||||||
return new String(str.readAllBytes());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,36 +0,0 @@
|
|||||||
package nu.marginalia.wmsa.configuration.module;
|
|
||||||
|
|
||||||
import com.google.inject.name.Named;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import javax.inject.Inject;
|
|
||||||
import javax.inject.Provider;
|
|
||||||
|
|
||||||
public class HostnameProvider implements Provider<String> {
|
|
||||||
private static final String DEFAULT_HOSTNAME = "127.0.0.1";
|
|
||||||
private final int monitorPort;
|
|
||||||
private final String monitorHost;
|
|
||||||
private final int timeout;
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
@Inject
|
|
||||||
public HostnameProvider(@Named("monitor-port") Integer monitorPort,
|
|
||||||
@Named("monitor-host") String monitorHost,
|
|
||||||
@Named("monitor-boot-timeout") Integer timeout
|
|
||||||
) {
|
|
||||||
this.monitorHost = monitorHost;
|
|
||||||
this.monitorPort = monitorPort;
|
|
||||||
this.timeout = timeout;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String get() {
|
|
||||||
var override = System.getProperty("service-host");
|
|
||||||
if (null != override) {
|
|
||||||
return override;
|
|
||||||
}
|
|
||||||
return DEFAULT_HOSTNAME;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,46 +0,0 @@
|
|||||||
package nu.marginalia.wmsa.configuration.module;
|
|
||||||
|
|
||||||
import com.google.inject.name.Named;
|
|
||||||
import io.reactivex.rxjava3.core.Flowable;
|
|
||||||
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
|
|
||||||
import org.apache.http.HttpResponse;
|
|
||||||
import org.reactivestreams.Publisher;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import javax.inject.Inject;
|
|
||||||
import javax.inject.Provider;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.concurrent.TimeUnit;
|
|
||||||
|
|
||||||
public class PortProvider implements Provider<Integer> {
|
|
||||||
private static final Integer DEFAULT_PORT = 5000;
|
|
||||||
private final int monitorPort;
|
|
||||||
private final String monitorHost;
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
private final int timeout = 10;
|
|
||||||
@Inject
|
|
||||||
public PortProvider(@Named("monitor-port") Integer monitorPort,
|
|
||||||
@Named("monitor-host") String monitorHost,
|
|
||||||
@Named("monitor-boot-timeout") Integer timeout) {
|
|
||||||
this.monitorHost = monitorHost;
|
|
||||||
this.monitorPort = monitorPort;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Integer get() {
|
|
||||||
return ServiceDescriptor.byName(System.getProperty("service-name")).port;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Publisher<?> repeatDelay(Flowable<Throwable> error) {
|
|
||||||
return error.delay(1, TimeUnit.SECONDS);
|
|
||||||
}
|
|
||||||
|
|
||||||
private String accept200(HttpResponse rsp) throws IOException {
|
|
||||||
if (rsp.getStatusLine().getStatusCode() != 200) {
|
|
||||||
throw new RuntimeException("Monitor responded unexpected status "
|
|
||||||
+ rsp.getStatusLine().getStatusCode());
|
|
||||||
}
|
|
||||||
return new String(rsp.getEntity().getContent().readAllBytes());
|
|
||||||
}
|
|
||||||
}
|
|
@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.assistant;
|
|||||||
|
|
||||||
import com.google.inject.AbstractModule;
|
import com.google.inject.AbstractModule;
|
||||||
import nu.marginalia.util.language.conf.LanguageModels;
|
import nu.marginalia.util.language.conf.LanguageModels;
|
||||||
|
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||||
|
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
|
||||||
@ -9,14 +10,8 @@ import static com.google.inject.name.Names.named;
|
|||||||
|
|
||||||
public class EdgeAssistantModule extends AbstractModule {
|
public class EdgeAssistantModule extends AbstractModule {
|
||||||
public void configure() {
|
public void configure() {
|
||||||
bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(Path.of("/var/lib/wmsa/suggestions.txt"));
|
bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("suggestions.txt"));
|
||||||
bind(LanguageModels.class).toInstance(new LanguageModels(
|
|
||||||
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
|
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||||
Path.of("/var/lib/wmsa/model/tfreq-new-algo3.bin"),
|
|
||||||
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
|
|
||||||
Path.of("/var/lib/wmsa/model/English.RDR"),
|
|
||||||
Path.of("/var/lib/wmsa/model/English.DICT"),
|
|
||||||
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,7 @@ import com.google.inject.AbstractModule;
|
|||||||
import com.google.inject.name.Names;
|
import com.google.inject.name.Names;
|
||||||
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
|
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
|
||||||
import nu.marginalia.util.language.conf.LanguageModels;
|
import nu.marginalia.util.language.conf.LanguageModels;
|
||||||
|
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
|
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
@ -30,14 +31,7 @@ public class ConverterModule extends AbstractModule {
|
|||||||
bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
|
bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
|
||||||
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
|
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
|
||||||
|
|
||||||
bind(LanguageModels.class).toInstance(new LanguageModels(
|
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||||
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
|
|
||||||
Path.of("/var/lib/wmsa/model/tfreq-new-algo3.bin"),
|
|
||||||
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
|
|
||||||
Path.of("/var/lib/wmsa/model/English.RDR"),
|
|
||||||
Path.of("/var/lib/wmsa/model/English.DICT"),
|
|
||||||
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Gson createGson() {
|
private Gson createGson() {
|
||||||
|
@ -185,26 +185,25 @@ public class DocumentProcessor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
|
private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
|
||||||
var links = doc.getElementsByTag("a");
|
|
||||||
var frames = doc.getElementsByTag("frame");
|
|
||||||
var feeds = doc.select("link[rel=alternate]");
|
|
||||||
|
|
||||||
LinkProcessor lp = new LinkProcessor(ret, baseUrl);
|
final LinkProcessor lp = new LinkProcessor(ret, baseUrl);
|
||||||
|
|
||||||
for (var atag : links) {
|
baseUrl = linkParser.getBaseLink(doc, baseUrl);
|
||||||
|
|
||||||
|
for (var atag : doc.getElementsByTag("a")) {
|
||||||
linkParser.parseLink(baseUrl, atag).ifPresent(lp::accept);
|
linkParser.parseLink(baseUrl, atag).ifPresent(lp::accept);
|
||||||
}
|
}
|
||||||
for (var frame : frames) {
|
for (var frame : doc.getElementsByTag("frame")) {
|
||||||
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (var link : feeds) {
|
for (var link : doc.select("link[rel=alternate]")) {
|
||||||
feedExtractor
|
feedExtractor
|
||||||
.getFeedFromAlternateTag(baseUrl, link)
|
.getFeedFromAlternateTag(baseUrl, link)
|
||||||
.ifPresent(lp::acceptFeed);
|
.ifPresent(lp::acceptFeed);
|
||||||
}
|
}
|
||||||
|
|
||||||
Set<String> linkTerms = new HashSet<>();
|
final Set<String> linkTerms = new HashSet<>();
|
||||||
|
|
||||||
for (var domain : lp.getForeignDomains()) {
|
for (var domain : lp.getForeignDomains()) {
|
||||||
linkTerms.add("links:"+domain.toString().toLowerCase());
|
linkTerms.add("links:"+domain.toString().toLowerCase());
|
||||||
|
@ -1,9 +1,12 @@
|
|||||||
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||||
|
|
||||||
import com.google.common.base.CharMatcher;
|
import com.google.common.base.CharMatcher;
|
||||||
|
import com.google.common.base.Strings;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
import org.jetbrains.annotations.Contract;
|
import org.jetbrains.annotations.Contract;
|
||||||
|
import org.jetbrains.annotations.Nullable;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@ -26,11 +29,11 @@ public class LinkParser {
|
|||||||
".gz", ".asc", ".md5", ".asf", ".mov", ".sig", ".pub", ".iso");
|
".gz", ".asc", ".md5", ".asf", ".mov", ".sig", ".pub", ".iso");
|
||||||
|
|
||||||
@Contract(pure=true)
|
@Contract(pure=true)
|
||||||
public Optional<EdgeUrl> parseLink(EdgeUrl baseUrl, Element l) {
|
public Optional<EdgeUrl> parseLink(EdgeUrl relativeBaseUrl, Element l) {
|
||||||
return Optional.of(l)
|
return Optional.of(l)
|
||||||
.filter(this::shouldIndexLink)
|
.filter(this::shouldIndexLink)
|
||||||
.map(this::getUrl)
|
.map(this::getUrl)
|
||||||
.map(link -> resolveUrl(baseUrl, link))
|
.map(link -> resolveUrl(relativeBaseUrl, link))
|
||||||
.flatMap(this::createURI)
|
.flatMap(this::createURI)
|
||||||
.map(URI::normalize)
|
.map(URI::normalize)
|
||||||
.map(this::renormalize)
|
.map(this::renormalize)
|
||||||
@ -100,6 +103,8 @@ public class LinkParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static final Pattern paramRegex = Pattern.compile("\\?.*$");
|
private static final Pattern paramRegex = Pattern.compile("\\?.*$");
|
||||||
|
private static final Pattern spaceRegex = Pattern.compile(" ");
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private String resolveUrl(EdgeUrl baseUrl, String s) {
|
private String resolveUrl(EdgeUrl baseUrl, String s) {
|
||||||
s = paramRegex.matcher(s).replaceAll("");
|
s = paramRegex.matcher(s).replaceAll("");
|
||||||
@ -111,10 +116,12 @@ public class LinkParser {
|
|||||||
|
|
||||||
// url looks like /my-page
|
// url looks like /my-page
|
||||||
if (s.startsWith("/")) {
|
if (s.startsWith("/")) {
|
||||||
return baseUrl.sibling(s).toString();
|
return baseUrl.withPath(s).toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
return baseUrl.sibling(relativeNavigation(baseUrl) + s.replaceAll(" ", "%20")).toString();
|
final String partFromNewLink = spaceRegex.matcher(s).replaceAll("%20");
|
||||||
|
|
||||||
|
return baseUrl.withPath(relativeNavigation(baseUrl) + partFromNewLink).toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
// for a relative url that looks like /foo or /foo/bar; return / or /foo
|
// for a relative url that looks like /foo or /foo/bar; return / or /foo
|
||||||
@ -162,4 +169,23 @@ public class LinkParser {
|
|||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Nullable
|
||||||
|
public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) {
|
||||||
|
var baseTags = parsed.getElementsByTag("base");
|
||||||
|
|
||||||
|
try {
|
||||||
|
for (var tag : baseTags) {
|
||||||
|
String href = tag.attr("href");
|
||||||
|
if (!Strings.isNullOrEmpty(href)) {
|
||||||
|
return new EdgeUrl(resolveUrl(documentUrl, href));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
logger.warn("Failed to parse <base href=...>, falling back to document url");
|
||||||
|
}
|
||||||
|
|
||||||
|
return documentUrl;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,8 @@ package nu.marginalia.wmsa.edge.crawling;
|
|||||||
|
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import com.google.gson.GsonBuilder;
|
import com.google.gson.GsonBuilder;
|
||||||
|
import nu.marginalia.wmsa.configuration.UserAgent;
|
||||||
|
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
||||||
import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver;
|
import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver;
|
||||||
@ -34,10 +36,12 @@ public class CrawlerMain implements AutoCloseable {
|
|||||||
private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
|
private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
|
||||||
new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
|
new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
|
||||||
|
|
||||||
|
private final UserAgent userAgent;
|
||||||
|
|
||||||
public CrawlerMain(EdgeCrawlPlan plan) throws Exception {
|
public CrawlerMain(EdgeCrawlPlan plan) throws Exception {
|
||||||
this.inputSpec = plan.getJobSpec();
|
this.inputSpec = plan.getJobSpec();
|
||||||
this.numberOfThreads = 512;
|
this.numberOfThreads = 512;
|
||||||
|
this.userAgent = WmsaHome.getUserAgent();
|
||||||
|
|
||||||
workLog = new WorkLog(plan.crawl.getLogFile());
|
workLog = new WorkLog(plan.crawl.getLogFile());
|
||||||
domainWriter = new CrawledDomainWriter(plan.crawl.getDir());
|
domainWriter = new CrawledDomainWriter(plan.crawl.getDir());
|
||||||
@ -88,7 +92,7 @@ public class CrawlerMain implements AutoCloseable {
|
|||||||
if (workLog.isJobFinished(specification.id))
|
if (workLog.isJobFinished(specification.id))
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
var fetcher = new HttpFetcher("search.marginalia.nu", dispatcher);
|
var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
var retreiver = new CrawlerRetreiver(fetcher, specification);
|
var retreiver = new CrawlerRetreiver(fetcher, specification);
|
||||||
|
@ -202,10 +202,11 @@ public class CrawlerRetreiver {
|
|||||||
return domain.equals(url.domain.toString().toLowerCase());
|
return domain.equals(url.domain.toString().toLowerCase());
|
||||||
}
|
}
|
||||||
|
|
||||||
private void findLinks(EdgeUrl url, Document parsed) {
|
private void findLinks(EdgeUrl baseUrl, Document parsed) {
|
||||||
|
baseUrl = linkParser.getBaseLink(parsed, baseUrl);
|
||||||
|
|
||||||
for (var link : parsed.getElementsByTag("a")) {
|
for (var link : parsed.getElementsByTag("a")) {
|
||||||
linkParser.parseLink(url, link)
|
linkParser.parseLink(baseUrl, link)
|
||||||
.filter(this::isSameDomain)
|
.filter(this::isSameDomain)
|
||||||
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
||||||
.filter(u -> !urlBlocklist.isForumLink(u))
|
.filter(u -> !urlBlocklist.isForumLink(u))
|
||||||
@ -213,7 +214,7 @@ public class CrawlerRetreiver {
|
|||||||
.ifPresent(queue::addLast);
|
.ifPresent(queue::addLast);
|
||||||
}
|
}
|
||||||
for (var link : parsed.getElementsByTag("frame")) {
|
for (var link : parsed.getElementsByTag("frame")) {
|
||||||
linkParser.parseFrame(url, link)
|
linkParser.parseFrame(baseUrl, link)
|
||||||
.filter(this::isSameDomain)
|
.filter(this::isSameDomain)
|
||||||
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
||||||
.filter(u -> !urlBlocklist.isForumLink(u))
|
.filter(u -> !urlBlocklist.isForumLink(u))
|
||||||
@ -221,7 +222,7 @@ public class CrawlerRetreiver {
|
|||||||
.ifPresent(queue::addLast);
|
.ifPresent(queue::addLast);
|
||||||
}
|
}
|
||||||
for (var link : parsed.getElementsByTag("iframe")) {
|
for (var link : parsed.getElementsByTag("iframe")) {
|
||||||
linkParser.parseFrame(url, link)
|
linkParser.parseFrame(baseUrl, link)
|
||||||
.filter(this::isSameDomain)
|
.filter(this::isSameDomain)
|
||||||
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
||||||
.filter(u -> !urlBlocklist.isForumLink(u))
|
.filter(u -> !urlBlocklist.isForumLink(u))
|
||||||
@ -230,10 +231,11 @@ public class CrawlerRetreiver {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl url, Document parsed) {
|
private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl baseUrl, Document parsed) {
|
||||||
|
baseUrl = baseUrl.withPath("/");
|
||||||
|
|
||||||
for (var link : parsed.select("link[rel=canonical]")) {
|
for (var link : parsed.select("link[rel=canonical]")) {
|
||||||
return linkParser.parseLink(url, link);
|
return linkParser.parseLink(baseUrl, link);
|
||||||
}
|
}
|
||||||
|
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
|
@ -2,17 +2,18 @@ package nu.marginalia.wmsa.edge.index;
|
|||||||
|
|
||||||
import com.google.inject.AbstractModule;
|
import com.google.inject.AbstractModule;
|
||||||
import com.google.inject.name.Names;
|
import com.google.inject.name.Names;
|
||||||
|
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||||
|
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
|
||||||
public class EdgeTablesModule extends AbstractModule {
|
public class EdgeTablesModule extends AbstractModule {
|
||||||
|
|
||||||
public void configure() {
|
public void configure() {
|
||||||
bind(Path.class).annotatedWith(Names.named("partition-root-slow")).toInstance(Path.of("/var/lib/wmsa/index/write"));
|
bind(Path.class).annotatedWith(Names.named("partition-root-slow")).toInstance(WmsaHome.getDisk("index-write"));
|
||||||
bind(Path.class).annotatedWith(Names.named("partition-root-slow-tmp")).toInstance(Path.of("/backup/work/index-tmp/"));
|
bind(Path.class).annotatedWith(Names.named("partition-root-fast")).toInstance(WmsaHome.getDisk("index-read"));
|
||||||
|
|
||||||
bind(Path.class).annotatedWith(Names.named("partition-root-fast")).toInstance(Path.of("/var/lib/wmsa/index/read"));
|
bind(Path.class).annotatedWith(Names.named("partition-root-slow-tmp")).toInstance(WmsaHome.getDisk("tmp-slow"));
|
||||||
bind(Path.class).annotatedWith(Names.named("tmp-file-dir")).toInstance(Path.of("/var/lib/wmsa/index/read"));
|
bind(Path.class).annotatedWith(Names.named("tmp-file-dir")).toInstance(WmsaHome.getDisk("tmp-fast"));
|
||||||
|
|
||||||
bind(String.class).annotatedWith(Names.named("edge-writer-page-index-file")).toInstance("page-index.dat");
|
bind(String.class).annotatedWith(Names.named("edge-writer-page-index-file")).toInstance("page-index.dat");
|
||||||
bind(String.class).annotatedWith(Names.named("edge-writer-dictionary-file")).toInstance("dictionary.dat");
|
bind(String.class).annotatedWith(Names.named("edge-writer-dictionary-file")).toInstance("dictionary.dat");
|
||||||
|
@ -21,6 +21,7 @@ public class EdgeDomain implements WideHashable {
|
|||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public EdgeDomain(String host) {
|
public EdgeDomain(String host) {
|
||||||
|
Objects.requireNonNull(host, "domain name must not be null");
|
||||||
|
|
||||||
var dot = host.lastIndexOf('.');
|
var dot = host.lastIndexOf('.');
|
||||||
|
|
||||||
|
@ -79,11 +79,6 @@ public class EdgeUrl implements WideHashable {
|
|||||||
this.port = port(URI.getPort(), proto);
|
this.port = port(URI.getPort(), proto);
|
||||||
}
|
}
|
||||||
|
|
||||||
public EdgeUrl sibling(String newPath) {
|
|
||||||
return new EdgeUrl(proto, domain, port, newPath);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static Integer port(Integer port, String protocol) {
|
private static Integer port(Integer port, String protocol) {
|
||||||
if (null == port || port < 1) {
|
if (null == port || port < 1) {
|
||||||
return null;
|
return null;
|
||||||
@ -120,5 +115,7 @@ public class EdgeUrl implements WideHashable {
|
|||||||
return (int) path.chars().filter(c -> c=='/').count();
|
return (int) path.chars().filter(c -> c=='/').count();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public EdgeUrl withPath(String s) {
|
||||||
|
return new EdgeUrl(proto, domain, port, s);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,21 +2,14 @@ package nu.marginalia.wmsa.edge.search;
|
|||||||
|
|
||||||
import com.google.inject.AbstractModule;
|
import com.google.inject.AbstractModule;
|
||||||
import nu.marginalia.util.language.conf.LanguageModels;
|
import nu.marginalia.util.language.conf.LanguageModels;
|
||||||
|
import nu.marginalia.wmsa.configuration.WebsiteUrl;
|
||||||
import java.nio.file.Path;
|
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||||
|
|
||||||
public class EdgeSearchModule extends AbstractModule {
|
public class EdgeSearchModule extends AbstractModule {
|
||||||
|
|
||||||
public void configure() {
|
public void configure() {
|
||||||
|
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||||
bind(LanguageModels.class).toInstance(new LanguageModels(
|
bind(WebsiteUrl.class).toInstance(new WebsiteUrl(System.getProperty("website-url", "https://search.marginalia.nu/")));
|
||||||
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
|
|
||||||
Path.of("/var/lib/wmsa/model/tfreq-new-algo3.bin"),
|
|
||||||
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
|
|
||||||
Path.of("/var/lib/wmsa/model/English.RDR"),
|
|
||||||
Path.of("/var/lib/wmsa/model/English.DICT"),
|
|
||||||
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -8,6 +8,7 @@ import com.google.inject.name.Named;
|
|||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.wmsa.api.model.ApiSearchResult;
|
import nu.marginalia.wmsa.api.model.ApiSearchResult;
|
||||||
import nu.marginalia.wmsa.api.model.ApiSearchResults;
|
import nu.marginalia.wmsa.api.model.ApiSearchResults;
|
||||||
|
import nu.marginalia.wmsa.configuration.WebsiteUrl;
|
||||||
import nu.marginalia.wmsa.configuration.server.Context;
|
import nu.marginalia.wmsa.configuration.server.Context;
|
||||||
import nu.marginalia.wmsa.configuration.server.Initialization;
|
import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||||
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
||||||
@ -34,7 +35,7 @@ public class EdgeSearchService extends Service {
|
|||||||
private final EdgeIndexClient indexClient;
|
private final EdgeIndexClient indexClient;
|
||||||
private final EdgeSearchOperator searchOperator;
|
private final EdgeSearchOperator searchOperator;
|
||||||
private final CommandEvaluator searchCommandEvaulator;
|
private final CommandEvaluator searchCommandEvaulator;
|
||||||
|
private final WebsiteUrl websiteUrl;
|
||||||
private static final Logger logger = LoggerFactory.getLogger(EdgeSearchService.class);
|
private static final Logger logger = LoggerFactory.getLogger(EdgeSearchService.class);
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@ -45,13 +46,14 @@ public class EdgeSearchService extends Service {
|
|||||||
Initialization initialization,
|
Initialization initialization,
|
||||||
MetricsServer metricsServer,
|
MetricsServer metricsServer,
|
||||||
EdgeSearchOperator searchOperator,
|
EdgeSearchOperator searchOperator,
|
||||||
CommandEvaluator searchCommandEvaulator
|
CommandEvaluator searchCommandEvaulator,
|
||||||
) {
|
WebsiteUrl websiteUrl) {
|
||||||
super(ip, port, initialization, metricsServer);
|
super(ip, port, initialization, metricsServer);
|
||||||
this.indexClient = indexClient;
|
this.indexClient = indexClient;
|
||||||
|
|
||||||
this.searchOperator = searchOperator;
|
this.searchOperator = searchOperator;
|
||||||
this.searchCommandEvaulator = searchCommandEvaulator;
|
this.searchCommandEvaulator = searchCommandEvaulator;
|
||||||
|
this.websiteUrl = websiteUrl;
|
||||||
|
|
||||||
Spark.staticFiles.expireTime(600);
|
Spark.staticFiles.expireTime(600);
|
||||||
|
|
||||||
@ -79,7 +81,7 @@ public class EdgeSearchService extends Service {
|
|||||||
final String query = URLEncoder.encode(String.format("%s site:%s", queryRaw, site), StandardCharsets.UTF_8);
|
final String query = URLEncoder.encode(String.format("%s site:%s", queryRaw, site), StandardCharsets.UTF_8);
|
||||||
final String profile = request.queryParamOrDefault("profile", "yolo");
|
final String profile = request.queryParamOrDefault("profile", "yolo");
|
||||||
|
|
||||||
response.redirect("https://search.marginalia.nu/search?query="+query+"&profile="+profile);
|
response.redirect(websiteUrl.withPath("search?query="+query+"&profile="+profile));
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@ -141,7 +143,7 @@ public class EdgeSearchService extends Service {
|
|||||||
|
|
||||||
final String queryParam = request.queryParams("query");
|
final String queryParam = request.queryParams("query");
|
||||||
if (null == queryParam || queryParam.isBlank()) {
|
if (null == queryParam || queryParam.isBlank()) {
|
||||||
response.redirect("https://search.marginalia.nu/");
|
response.redirect(websiteUrl.url());
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,7 +7,6 @@ import java.nio.file.Path;
|
|||||||
|
|
||||||
public class ResourceStoreModule extends AbstractModule {
|
public class ResourceStoreModule extends AbstractModule {
|
||||||
public void configure() {
|
public void configure() {
|
||||||
bind(String.class).annotatedWith(Names.named("external-url")).toInstance("https://reddit.marginalia.nu/");
|
|
||||||
bind(Path.class).annotatedWith(Names.named("data-path")).toInstance(Path.of("/var/lib/wmsa/archive.fast/resources"));
|
bind(Path.class).annotatedWith(Names.named("data-path")).toInstance(Path.of("/var/lib/wmsa/archive.fast/resources"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -11,9 +11,8 @@ import static org.junit.jupiter.api.Assertions.*;
|
|||||||
|
|
||||||
class LinkParserTest {
|
class LinkParserTest {
|
||||||
|
|
||||||
private String parseLink(String href, String base) throws URISyntaxException {
|
private String parseLink(String href, String relBase) throws URISyntaxException {
|
||||||
var url = new EdgeUrl("http://www.marginalia.nu/" + base);
|
var url = new EdgeUrl("http://www.marginalia.nu/" + relBase);
|
||||||
var domain = url.domain;
|
|
||||||
var parser = new LinkParser();
|
var parser = new LinkParser();
|
||||||
var stuff = Jsoup.parseBodyFragment("<a href='"+href+"''>test</a>");
|
var stuff = Jsoup.parseBodyFragment("<a href='"+href+"''>test</a>");
|
||||||
var lnk = parser.parseLink(
|
var lnk = parser.parseLink(
|
||||||
@ -43,6 +42,7 @@ class LinkParserTest {
|
|||||||
void testAnchor() throws URISyntaxException {
|
void testAnchor() throws URISyntaxException {
|
||||||
assertNull(parseLink("#test", "/"));
|
assertNull(parseLink("#test", "/"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testRelative() throws URISyntaxException {
|
void testRelative() throws URISyntaxException {
|
||||||
assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/"));
|
assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/"));
|
||||||
@ -51,4 +51,32 @@ class LinkParserTest {
|
|||||||
assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/foo/index.html"));
|
assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/foo/index.html"));
|
||||||
assertEquals("http://www.marginalia.nu/test", parseLink("/test", "/foo/index.html"));
|
assertEquals("http://www.marginalia.nu/test", parseLink("/test", "/foo/index.html"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private EdgeUrl getBaseUrl(String href, EdgeUrl documentUrl) {
|
||||||
|
LinkParser lp = new LinkParser();
|
||||||
|
|
||||||
|
return lp.getBaseLink(Jsoup.parse("<base href=\"" + href + "\" />"), documentUrl);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void getBaseUrlTest() throws URISyntaxException {
|
||||||
|
assertEquals(new EdgeUrl("https://www.marginalia.nu/base"),
|
||||||
|
getBaseUrl("/base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar")));
|
||||||
|
|
||||||
|
assertEquals(new EdgeUrl("https://memex.marginalia.nu/base"),
|
||||||
|
getBaseUrl("https://memex.marginalia.nu/base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar")));
|
||||||
|
|
||||||
|
assertEquals(new EdgeUrl("https://www.marginalia.nu/test/base"),
|
||||||
|
getBaseUrl("base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar")));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testParseBadBaseLink() throws URISyntaxException {
|
||||||
|
LinkParser lp = new LinkParser();
|
||||||
|
var url = new EdgeUrl("https://memex.marginalia.nu/");
|
||||||
|
|
||||||
|
assertEquals(url, lp.getBaseLink(Jsoup.parse("<base href/>"), url));
|
||||||
|
assertEquals(url, lp.getBaseLink(Jsoup.parse("<base target=\"foo\"/>"), url));
|
||||||
|
assertEquals(url, lp.getBaseLink(Jsoup.parse("<base href=\"http://\"/>"), url));
|
||||||
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user