From a3a6b40cc3e67e6273f939dfea73322868b4926d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 15 Jun 2022 16:54:27 +0200 Subject: [PATCH] Changes to crawler (#28) Co-authored-by: vlofgren Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/28 --- README.md | 9 ++- .../nu/marginalia/wmsa/edge/E2ETestBase.java | 2 +- marginalia_nu/src/e2e/resources/crawl.sh | 2 + marginalia_nu/src/e2e/resources/init.sh | 33 ++++++---- .../wmsa/configuration/UserAgent.java | 5 ++ .../wmsa/configuration/WebsiteUrl.java | 7 +++ .../wmsa/configuration/WmsaHome.java | 60 +++++++++++++++---- .../module/ConfigurationModule.java | 30 ++-------- .../module/HostnameProvider.java | 36 ----------- .../configuration/module/PortProvider.java | 46 -------------- .../edge/assistant/EdgeAssistantModule.java | 13 ++-- .../wmsa/edge/converting/ConverterModule.java | 10 +--- .../processor/DocumentProcessor.java | 17 +++--- .../processor/logic/LinkParser.java | 34 +++++++++-- .../wmsa/edge/crawling/CrawlerMain.java | 6 +- .../crawling/retreival/CrawlerRetreiver.java | 14 +++-- .../wmsa/edge/index/EdgeTablesModule.java | 9 +-- .../wmsa/edge/model/EdgeDomain.java | 1 + .../marginalia/wmsa/edge/model/EdgeUrl.java | 9 +-- .../wmsa/edge/search/EdgeSearchModule.java | 15 ++--- .../wmsa/edge/search/EdgeSearchService.java | 12 ++-- .../resource_store/ResourceStoreModule.java | 1 - .../wmsa/edge/crawling/LinkParserTest.java | 34 ++++++++++- 23 files changed, 203 insertions(+), 202 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/UserAgent.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WebsiteUrl.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/HostnameProvider.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/PortProvider.java diff --git a/README.md b/README.md index cfe88bc9..927fff6f 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,8 @@ the [MEMEX/gemini server](https://memex.marginalia.nu), the and the [encyclopedi The aim of the project is to develop new and alternative discovery methods for the Internet. It's an experimental workshop as much as it is a public service, the overarching goal is to -elevate the more human, non-commercial sides of the Internet. +elevate the more human, non-commercial sides of the Internet. A side-goal is to do this without +requiring datacenters and expensive enterprise hardware, to run this operation on affordable hardware. The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu). It is fine to mirror it on other hosts, but if you have issues or questions @@ -16,6 +17,10 @@ it wasn't developed with the intention of going open source, a lot of tests and so on make assumptions about the directory structure, much configuration is hard coded and so on. Please stand by. A lot of the mess is fairly superficial. +## Documentation + +Documentation is a work in progress. See the [wiki](https://git.marginalia.nu/marginalia/marginalia.nu/wiki). + ## Contributing The project is still being set up, but if you are interested in contributing, please contact me. @@ -26,4 +31,4 @@ Consider [supporting this project](https://memex.marginalia.nu/projects/edge/sup ## Contact -You can email with any questions or feedback. \ No newline at end of file +You can email with any questions or feedback. diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java index 769eca40..0c329a79 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java @@ -33,7 +33,7 @@ public abstract class E2ETestBase { .withCopyFileToContainer(jarFile(), "/WMSA.jar") .withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh") .withExposedPorts(service.port) - .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY) + .withFileSystemBind(modelsPath(), "/wmsa/model", BindMode.READ_ONLY) .withNetwork(network) .withNetworkAliases(service.name) .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name))) diff --git a/marginalia_nu/src/e2e/resources/crawl.sh b/marginalia_nu/src/e2e/resources/crawl.sh index 3a0e4b01..16d43fab 100644 --- a/marginalia_nu/src/e2e/resources/crawl.sh +++ b/marginalia_nu/src/e2e/resources/crawl.sh @@ -3,6 +3,8 @@ mkdir -p /var/lib/wmsa/conf/ mkdir -p /var/lib/wmsa/data/ +echo "search.marginalia.nu" > /var/lib/wmsa/conf/user-agent + cat > /var/lib/wmsa/conf/db.properties < /var/lib/wmsa/suggestions.txt < ${HOME}/suggestions.txt < /var/lib/wmsa/conf/disks.properties < ${HOME}/conf/disks.properties < /var/lib/wmsa/conf/db.properties < ${HOME}/conf/db.properties < /var/lib/wmsa/conf/ranking-settings.yaml < ${HOME}/conf/ranking-settings.yaml < /var/lib/wmsa/conf/hosts < ${HOME}/conf/hosts < { - private static final String DEFAULT_HOSTNAME = "127.0.0.1"; - private final int monitorPort; - private final String monitorHost; - private final int timeout; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - @Inject - public HostnameProvider(@Named("monitor-port") Integer monitorPort, - @Named("monitor-host") String monitorHost, - @Named("monitor-boot-timeout") Integer timeout - ) { - this.monitorHost = monitorHost; - this.monitorPort = monitorPort; - this.timeout = timeout; - } - - @Override - public String get() { - var override = System.getProperty("service-host"); - if (null != override) { - return override; - } - return DEFAULT_HOSTNAME; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/PortProvider.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/PortProvider.java deleted file mode 100644 index 7286aa68..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/PortProvider.java +++ /dev/null @@ -1,46 +0,0 @@ -package nu.marginalia.wmsa.configuration.module; - -import com.google.inject.name.Named; -import io.reactivex.rxjava3.core.Flowable; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import org.apache.http.HttpResponse; -import org.reactivestreams.Publisher; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.inject.Inject; -import javax.inject.Provider; -import java.io.IOException; -import java.util.concurrent.TimeUnit; - -public class PortProvider implements Provider { - private static final Integer DEFAULT_PORT = 5000; - private final int monitorPort; - private final String monitorHost; - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final int timeout = 10; - @Inject - public PortProvider(@Named("monitor-port") Integer monitorPort, - @Named("monitor-host") String monitorHost, - @Named("monitor-boot-timeout") Integer timeout) { - this.monitorHost = monitorHost; - this.monitorPort = monitorPort; - } - - @Override - public Integer get() { - return ServiceDescriptor.byName(System.getProperty("service-name")).port; - } - - private Publisher repeatDelay(Flowable error) { - return error.delay(1, TimeUnit.SECONDS); - } - - private String accept200(HttpResponse rsp) throws IOException { - if (rsp.getStatusLine().getStatusCode() != 200) { - throw new RuntimeException("Monitor responded unexpected status " - + rsp.getStatusLine().getStatusCode()); - } - return new String(rsp.getEntity().getContent().readAllBytes()); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantModule.java index cc5c3fe6..dcc8d90d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantModule.java @@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.assistant; import com.google.inject.AbstractModule; import nu.marginalia.util.language.conf.LanguageModels; +import nu.marginalia.wmsa.configuration.WmsaHome; import java.nio.file.Path; @@ -9,14 +10,8 @@ import static com.google.inject.name.Names.named; public class EdgeAssistantModule extends AbstractModule { public void configure() { - bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(Path.of("/var/lib/wmsa/suggestions.txt")); - bind(LanguageModels.class).toInstance(new LanguageModels( - Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"), - Path.of("/var/lib/wmsa/model/tfreq-new-algo3.bin"), - Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"), - Path.of("/var/lib/wmsa/model/English.RDR"), - Path.of("/var/lib/wmsa/model/English.DICT"), - Path.of("/var/lib/wmsa/model/opennlp-tok.bin") - )); + bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("suggestions.txt")); + + bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java index 6f03632f..4bf6eaea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java @@ -5,6 +5,7 @@ import com.google.inject.AbstractModule; import com.google.inject.name.Names; import marcono1234.gson.recordadapter.RecordTypeAdapterFactory; import nu.marginalia.util.language.conf.LanguageModels; +import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; @@ -30,14 +31,7 @@ public class ConverterModule extends AbstractModule { bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128); bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255); - bind(LanguageModels.class).toInstance(new LanguageModels( - Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"), - Path.of("/var/lib/wmsa/model/tfreq-new-algo3.bin"), - Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"), - Path.of("/var/lib/wmsa/model/English.RDR"), - Path.of("/var/lib/wmsa/model/English.DICT"), - Path.of("/var/lib/wmsa/model/opennlp-tok.bin") - )); + bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); } private Gson createGson() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index ce6393f2..b205cdea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -185,26 +185,25 @@ public class DocumentProcessor { } private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) { - var links = doc.getElementsByTag("a"); - var frames = doc.getElementsByTag("frame"); - var feeds = doc.select("link[rel=alternate]"); - LinkProcessor lp = new LinkProcessor(ret, baseUrl); + final LinkProcessor lp = new LinkProcessor(ret, baseUrl); - for (var atag : links) { + baseUrl = linkParser.getBaseLink(doc, baseUrl); + + for (var atag : doc.getElementsByTag("a")) { linkParser.parseLink(baseUrl, atag).ifPresent(lp::accept); } - for (var frame : frames) { + for (var frame : doc.getElementsByTag("frame")) { linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept); } - for (var link : feeds) { + for (var link : doc.select("link[rel=alternate]")) { feedExtractor - .getFeedFromAlternateTag(baseUrl, link) + .getFeedFromAlternateTag(baseUrl, link) .ifPresent(lp::acceptFeed); } - Set linkTerms = new HashSet<>(); + final Set linkTerms = new HashSet<>(); for (var domain : lp.getForeignDomains()) { linkTerms.add("links:"+domain.toString().toLowerCase()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index aedaf0f7..378182f2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -1,9 +1,12 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; import com.google.common.base.CharMatcher; +import com.google.common.base.Strings; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.jetbrains.annotations.Contract; +import org.jetbrains.annotations.Nullable; +import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,11 +29,11 @@ public class LinkParser { ".gz", ".asc", ".md5", ".asf", ".mov", ".sig", ".pub", ".iso"); @Contract(pure=true) - public Optional parseLink(EdgeUrl baseUrl, Element l) { + public Optional parseLink(EdgeUrl relativeBaseUrl, Element l) { return Optional.of(l) .filter(this::shouldIndexLink) .map(this::getUrl) - .map(link -> resolveUrl(baseUrl, link)) + .map(link -> resolveUrl(relativeBaseUrl, link)) .flatMap(this::createURI) .map(URI::normalize) .map(this::renormalize) @@ -100,6 +103,8 @@ public class LinkParser { } private static final Pattern paramRegex = Pattern.compile("\\?.*$"); + private static final Pattern spaceRegex = Pattern.compile(" "); + @SneakyThrows private String resolveUrl(EdgeUrl baseUrl, String s) { s = paramRegex.matcher(s).replaceAll(""); @@ -111,10 +116,12 @@ public class LinkParser { // url looks like /my-page if (s.startsWith("/")) { - return baseUrl.sibling(s).toString(); + return baseUrl.withPath(s).toString(); } - return baseUrl.sibling(relativeNavigation(baseUrl) + s.replaceAll(" ", "%20")).toString(); + final String partFromNewLink = spaceRegex.matcher(s).replaceAll("%20"); + + return baseUrl.withPath(relativeNavigation(baseUrl) + partFromNewLink).toString(); } // for a relative url that looks like /foo or /foo/bar; return / or /foo @@ -162,4 +169,23 @@ public class LinkParser { } return true; } + + @Nullable + public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) { + var baseTags = parsed.getElementsByTag("base"); + + try { + for (var tag : baseTags) { + String href = tag.attr("href"); + if (!Strings.isNullOrEmpty(href)) { + return new EdgeUrl(resolveUrl(documentUrl, href)); + } + } + } + catch (Exception ex) { + logger.warn("Failed to parse , falling back to document url"); + } + + return documentUrl; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java index d81e348b..7238dce0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java @@ -2,6 +2,8 @@ package nu.marginalia.wmsa.edge.crawling; import com.google.gson.Gson; import com.google.gson.GsonBuilder; +import nu.marginalia.wmsa.configuration.UserAgent; +import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver; @@ -34,10 +36,12 @@ public class CrawlerMain implements AutoCloseable { private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS, new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true))); + private final UserAgent userAgent; public CrawlerMain(EdgeCrawlPlan plan) throws Exception { this.inputSpec = plan.getJobSpec(); this.numberOfThreads = 512; + this.userAgent = WmsaHome.getUserAgent(); workLog = new WorkLog(plan.crawl.getLogFile()); domainWriter = new CrawledDomainWriter(plan.crawl.getDir()); @@ -88,7 +92,7 @@ public class CrawlerMain implements AutoCloseable { if (workLog.isJobFinished(specification.id)) return null; - var fetcher = new HttpFetcher("search.marginalia.nu", dispatcher); + var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher); try { var retreiver = new CrawlerRetreiver(fetcher, specification); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index a7c08a24..2b27ed4d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -202,10 +202,11 @@ public class CrawlerRetreiver { return domain.equals(url.domain.toString().toLowerCase()); } - private void findLinks(EdgeUrl url, Document parsed) { + private void findLinks(EdgeUrl baseUrl, Document parsed) { + baseUrl = linkParser.getBaseLink(parsed, baseUrl); for (var link : parsed.getElementsByTag("a")) { - linkParser.parseLink(url, link) + linkParser.parseLink(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) .filter(u -> !urlBlocklist.isForumLink(u)) @@ -213,7 +214,7 @@ public class CrawlerRetreiver { .ifPresent(queue::addLast); } for (var link : parsed.getElementsByTag("frame")) { - linkParser.parseFrame(url, link) + linkParser.parseFrame(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) .filter(u -> !urlBlocklist.isForumLink(u)) @@ -221,7 +222,7 @@ public class CrawlerRetreiver { .ifPresent(queue::addLast); } for (var link : parsed.getElementsByTag("iframe")) { - linkParser.parseFrame(url, link) + linkParser.parseFrame(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) .filter(u -> !urlBlocklist.isForumLink(u)) @@ -230,10 +231,11 @@ public class CrawlerRetreiver { } } - private Optional findCanonicalUrl(EdgeUrl url, Document parsed) { + private Optional findCanonicalUrl(EdgeUrl baseUrl, Document parsed) { + baseUrl = baseUrl.withPath("/"); for (var link : parsed.select("link[rel=canonical]")) { - return linkParser.parseLink(url, link); + return linkParser.parseLink(baseUrl, link); } return Optional.empty(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeTablesModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeTablesModule.java index bc9c2f44..4650b15b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeTablesModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeTablesModule.java @@ -2,17 +2,18 @@ package nu.marginalia.wmsa.edge.index; import com.google.inject.AbstractModule; import com.google.inject.name.Names; +import nu.marginalia.wmsa.configuration.WmsaHome; import java.nio.file.Path; public class EdgeTablesModule extends AbstractModule { public void configure() { - bind(Path.class).annotatedWith(Names.named("partition-root-slow")).toInstance(Path.of("/var/lib/wmsa/index/write")); - bind(Path.class).annotatedWith(Names.named("partition-root-slow-tmp")).toInstance(Path.of("/backup/work/index-tmp/")); + bind(Path.class).annotatedWith(Names.named("partition-root-slow")).toInstance(WmsaHome.getDisk("index-write")); + bind(Path.class).annotatedWith(Names.named("partition-root-fast")).toInstance(WmsaHome.getDisk("index-read")); - bind(Path.class).annotatedWith(Names.named("partition-root-fast")).toInstance(Path.of("/var/lib/wmsa/index/read")); - bind(Path.class).annotatedWith(Names.named("tmp-file-dir")).toInstance(Path.of("/var/lib/wmsa/index/read")); + bind(Path.class).annotatedWith(Names.named("partition-root-slow-tmp")).toInstance(WmsaHome.getDisk("tmp-slow")); + bind(Path.class).annotatedWith(Names.named("tmp-file-dir")).toInstance(WmsaHome.getDisk("tmp-fast")); bind(String.class).annotatedWith(Names.named("edge-writer-page-index-file")).toInstance("page-index.dat"); bind(String.class).annotatedWith(Names.named("edge-writer-dictionary-file")).toInstance("dictionary.dat"); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java index cb778947..53740c95 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java @@ -21,6 +21,7 @@ public class EdgeDomain implements WideHashable { @SneakyThrows public EdgeDomain(String host) { + Objects.requireNonNull(host, "domain name must not be null"); var dot = host.lastIndexOf('.'); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java index 39bc475b..e82d4b7c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java @@ -79,11 +79,6 @@ public class EdgeUrl implements WideHashable { this.port = port(URI.getPort(), proto); } - public EdgeUrl sibling(String newPath) { - return new EdgeUrl(proto, domain, port, newPath); - } - - private static Integer port(Integer port, String protocol) { if (null == port || port < 1) { return null; @@ -120,5 +115,7 @@ public class EdgeUrl implements WideHashable { return (int) path.chars().filter(c -> c=='/').count(); } - + public EdgeUrl withPath(String s) { + return new EdgeUrl(proto, domain, port, s); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchModule.java index 9e1df8d5..9db18272 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchModule.java @@ -2,21 +2,14 @@ package nu.marginalia.wmsa.edge.search; import com.google.inject.AbstractModule; import nu.marginalia.util.language.conf.LanguageModels; - -import java.nio.file.Path; +import nu.marginalia.wmsa.configuration.WebsiteUrl; +import nu.marginalia.wmsa.configuration.WmsaHome; public class EdgeSearchModule extends AbstractModule { public void configure() { - - bind(LanguageModels.class).toInstance(new LanguageModels( - Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"), - Path.of("/var/lib/wmsa/model/tfreq-new-algo3.bin"), - Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"), - Path.of("/var/lib/wmsa/model/English.RDR"), - Path.of("/var/lib/wmsa/model/English.DICT"), - Path.of("/var/lib/wmsa/model/opennlp-tok.bin") - )); + bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); + bind(WebsiteUrl.class).toInstance(new WebsiteUrl(System.getProperty("website-url", "https://search.marginalia.nu/"))); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java index 329322a2..fa2d06e0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java @@ -8,6 +8,7 @@ import com.google.inject.name.Named; import lombok.SneakyThrows; import nu.marginalia.wmsa.api.model.ApiSearchResult; import nu.marginalia.wmsa.api.model.ApiSearchResults; +import nu.marginalia.wmsa.configuration.WebsiteUrl; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; @@ -34,7 +35,7 @@ public class EdgeSearchService extends Service { private final EdgeIndexClient indexClient; private final EdgeSearchOperator searchOperator; private final CommandEvaluator searchCommandEvaulator; - + private final WebsiteUrl websiteUrl; private static final Logger logger = LoggerFactory.getLogger(EdgeSearchService.class); @SneakyThrows @@ -45,13 +46,14 @@ public class EdgeSearchService extends Service { Initialization initialization, MetricsServer metricsServer, EdgeSearchOperator searchOperator, - CommandEvaluator searchCommandEvaulator - ) { + CommandEvaluator searchCommandEvaulator, + WebsiteUrl websiteUrl) { super(ip, port, initialization, metricsServer); this.indexClient = indexClient; this.searchOperator = searchOperator; this.searchCommandEvaulator = searchCommandEvaulator; + this.websiteUrl = websiteUrl; Spark.staticFiles.expireTime(600); @@ -79,7 +81,7 @@ public class EdgeSearchService extends Service { final String query = URLEncoder.encode(String.format("%s site:%s", queryRaw, site), StandardCharsets.UTF_8); final String profile = request.queryParamOrDefault("profile", "yolo"); - response.redirect("https://search.marginalia.nu/search?query="+query+"&profile="+profile); + response.redirect(websiteUrl.withPath("search?query="+query+"&profile="+profile)); return null; } @@ -141,7 +143,7 @@ public class EdgeSearchService extends Service { final String queryParam = request.queryParams("query"); if (null == queryParam || queryParam.isBlank()) { - response.redirect("https://search.marginalia.nu/"); + response.redirect(websiteUrl.url()); return null; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreModule.java index 2de9e931..30bac9d3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreModule.java @@ -7,7 +7,6 @@ import java.nio.file.Path; public class ResourceStoreModule extends AbstractModule { public void configure() { - bind(String.class).annotatedWith(Names.named("external-url")).toInstance("https://reddit.marginalia.nu/"); bind(Path.class).annotatedWith(Names.named("data-path")).toInstance(Path.of("/var/lib/wmsa/archive.fast/resources")); } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java index 80c62153..d4a7e428 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java @@ -11,9 +11,8 @@ import static org.junit.jupiter.api.Assertions.*; class LinkParserTest { - private String parseLink(String href, String base) throws URISyntaxException { - var url = new EdgeUrl("http://www.marginalia.nu/" + base); - var domain = url.domain; + private String parseLink(String href, String relBase) throws URISyntaxException { + var url = new EdgeUrl("http://www.marginalia.nu/" + relBase); var parser = new LinkParser(); var stuff = Jsoup.parseBodyFragment("test"); var lnk = parser.parseLink( @@ -43,6 +42,7 @@ class LinkParserTest { void testAnchor() throws URISyntaxException { assertNull(parseLink("#test", "/")); } + @Test void testRelative() throws URISyntaxException { assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/")); @@ -51,4 +51,32 @@ class LinkParserTest { assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/foo/index.html")); assertEquals("http://www.marginalia.nu/test", parseLink("/test", "/foo/index.html")); } + + private EdgeUrl getBaseUrl(String href, EdgeUrl documentUrl) { + LinkParser lp = new LinkParser(); + + return lp.getBaseLink(Jsoup.parse(""), documentUrl); + } + + @Test + public void getBaseUrlTest() throws URISyntaxException { + assertEquals(new EdgeUrl("https://www.marginalia.nu/base"), + getBaseUrl("/base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar"))); + + assertEquals(new EdgeUrl("https://memex.marginalia.nu/base"), + getBaseUrl("https://memex.marginalia.nu/base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar"))); + + assertEquals(new EdgeUrl("https://www.marginalia.nu/test/base"), + getBaseUrl("base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar"))); + } + + @Test + public void testParseBadBaseLink() throws URISyntaxException { + LinkParser lp = new LinkParser(); + var url = new EdgeUrl("https://memex.marginalia.nu/"); + + assertEquals(url, lp.getBaseLink(Jsoup.parse(""), url)); + assertEquals(url, lp.getBaseLink(Jsoup.parse(""), url)); + assertEquals(url, lp.getBaseLink(Jsoup.parse(""), url)); + } } \ No newline at end of file