Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-24 13:19:02 +00:00
(crawler) Improve meta-tag redirect handling, add tests for redirects.
Wrote a new test to examine the redirect behavior of the crawler, ensuring that the redirect URL is the URL reported in the parquet file. This works as intended.

Noticed in the course of this that the crawler doesn't add links from meta-tag redirects to the crawl frontier. Added logic to handle this case, and amended the test case to verify the new behavior.

Added the meta-redirect case to HtmlDocumentProcessorPlugin as well, so that it is considered a link between documents in the unlikely case that a meta redirect points to another domain.
parent 93a2d5afbf
commit 785d8deadd
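For context, the kind of tag this commit handles is an HTML meta refresh: a head element whose content attribute carries a delay and a target URL. Below is a minimal standalone sketch of selecting such a tag — the HTML sample and class name are invented for illustration; jsoup is the parser the crawler itself uses:

import org.jsoup.Jsoup;

// Minimal sketch of the tag this commit handles (HTML sample invented for
// illustration); jsoup is the parser the crawler itself uses.
public class MetaRedirectDemo {
    public static void main(String[] args) {
        var doc = Jsoup.parse("""
                <html><head>
                <meta http-equiv="refresh" content="5; url=https://www.example.com/moved">
                </head><body></body></html>
                """);

        // The same selector the commit adds to both the crawl frontier
        // and HtmlDocumentProcessorPlugin
        for (var meta : doc.select("meta[http-equiv=refresh]")) {
            System.out.println(meta.attr("content"));
            // prints: 5; url=https://www.example.com/moved
        }
    }
}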
LinkParser.java
@@ -20,9 +20,11 @@ import java.util.regex.Pattern;
 public class LinkParser {
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
-    private final List<String> blockPrefixList = List.of(
+    // These are schemas that we don't want to try to index
+    private final List<String> blockedSchemaList = List.of(
             "mailto:", "javascript:", "tel:", "itpc:", "#", "file:");
 
+    // These are file suffixes we suspect may be a binary file
     private final List<String> binarySuffixList = List.of(
             ".pdf", ".mp3", ".wmv", ".avi", ".zip", ".7z",
             ".mpv", ".mp4", ".avi", ".mkv", ".tiff", ".dat", ".tar",
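The renamed field drives a simple prefix check (shown in a later hunk). A quick standalone illustration of how it classifies a few sample hrefs — the samples are invented, and note that the real code lowercases the href before running this check:

import java.util.List;

// Standalone illustration of the prefix-based schema filter; sample hrefs are
// invented, and the real code lowercases the href before this check.
public class SchemaFilterDemo {
    public static void main(String[] args) {
        List<String> blockedSchemaList = List.of(
                "mailto:", "javascript:", "tel:", "itpc:", "#", "file:");

        List<String> samples = List.of(
                "mailto:hello@example.com",  // blocked: mailto schema
                "#top",                      // blocked: fragment-only link
                "https://example.com/page"); // allowed

        for (String href : samples) {
            boolean blocked = blockedSchemaList.stream().anyMatch(href::startsWith);
            System.out.println(href + " -> blocked=" + blocked);
        }
    }
}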
@@ -96,6 +98,30 @@ public class LinkParser {
                 .flatMap(this::createEdgeUrl);
     }
 
+    @Contract(pure=true)
+    public Optional<EdgeUrl> parseMetaRedirect(EdgeUrl baseUrl, Element meta) {
+        return Optional.of(meta)
+                .map(l -> l.attr("content"))
+                .flatMap(this::getMetaRedirectUrl)
+                .map(link -> resolveRelativeUrl(baseUrl, link))
+                .flatMap(this::createURI)
+                .map(URI::normalize)
+                .map(this::renormalize)
+                .flatMap(this::createEdgeUrl);
+    }
+
+    // Matches the format of a meta http-equiv=refresh content tag, e.g. '10; url=http://example.com/'
+    private static Pattern metaRedirectPattern = Pattern.compile("^\\d+\\s*;\\s*url=(\\S+)\\s*$");
+
+    /** Parse the URL from a meta refresh tag, returning only the URL part and
+     * discarding the rest. Returns Optional.empty() on parse error. */
+    private Optional<String> getMetaRedirectUrl(String content) {
+        var matcher = metaRedirectPattern.matcher(content);
+
+        if (!matcher.find())
+            return Optional.empty();
+        return Optional.ofNullable(matcher.group(1));
+    }
+
     @SneakyThrows
     private URI renormalize(URI uri) {
         if (uri.getPath() == null) {
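To make the anchored pattern's behavior concrete, here is getMetaRedirectUrl lifted into a small standalone program and run against a few invented content values:

import java.util.Optional;
import java.util.regex.Pattern;

// Standalone check of the redirect-content pattern added above, run against
// a few sample values (samples invented for illustration).
public class MetaRedirectPatternDemo {
    private static final Pattern metaRedirectPattern =
            Pattern.compile("^\\d+\\s*;\\s*url=(\\S+)\\s*$");

    static Optional<String> getMetaRedirectUrl(String content) {
        var matcher = metaRedirectPattern.matcher(content);
        if (!matcher.find())
            return Optional.empty();
        return Optional.ofNullable(matcher.group(1));
    }

    public static void main(String[] args) {
        // Matches: numeric delay, semicolon, url=...
        System.out.println(getMetaRedirectUrl("10; url=http://example.com/"));
        // Optional[http://example.com/]

        // No numeric delay up front, so the anchored pattern rejects it
        System.out.println(getMetaRedirectUrl("url=http://example.com/"));
        // Optional.empty

        // A plain delay with no URL is also rejected
        System.out.println(getMetaRedirectUrl("30"));
        // Optional.empty
    }
}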
@@ -191,7 +217,7 @@ public class LinkParser {
         }
         href = href.toLowerCase();
 
-        if (blockPrefixList.stream().anyMatch(href::startsWith)) {
+        if (blockedSchemaList.stream().anyMatch(href::startsWith)) {
             return false;
         }
         if (hasBinarySuffix(href)) {
HtmlDocumentProcessorPlugin.java
@@ -284,6 +284,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         for (var frame : doc.getElementsByTag("iframe")) {
             linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
         }
+        for (var meta : doc.select("meta[http-equiv=refresh]")) {
+            linkParser.parseMetaRedirect(baseUrl, meta).ifPresent(lp::accept);
+        }
         for (var link : doc.select("link[rel=alternate]")) {
             feedExtractor
                     .getFeedFromAlternateTag(baseUrl, link)
CrawlerRetreiver.java
@@ -79,6 +79,11 @@ public class CrawlerRetreiver implements AutoCloseable {
         }
     }
 
+    // For testing
+    public DomainCrawlFrontier getCrawlFrontier() {
+        return crawlFrontier;
+    }
+
     public int fetch() {
         return fetch(new DomainLinks(), new CrawlDataReference());
     }
DomainCrawlFrontier.java
@@ -111,6 +111,10 @@ public class DomainCrawlFrontier {
         long hashCode = hasher.hashNearlyASCII(url.toString());
         return visited.contains(hashCode);
     }
+    public boolean isKnown(EdgeUrl url) {
+        long hashCode = hasher.hashNearlyASCII(url.toString());
+        return known.contains(hashCode);
+    }
 
     public boolean filterLink(EdgeUrl url) {
         return linkFilter.test(url);
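The new isKnown complements isVisited: judging by the names and the assertions in the new test further down, a URL becomes known as soon as it enters the frontier, but visited only once it has actually been fetched. A rough sketch of that bookkeeping, with a plain HashSet<Long> and a generic stand-in hash in place of the project's hashNearlyASCII:

import java.util.HashSet;
import java.util.Set;

// Illustrative stand-in for DomainCrawlFrontier's bookkeeping; the hash below
// is a generic polynomial hash, not the project's hashNearlyASCII, and the
// known/visited semantics are inferred from the new test's assertions.
class FrontierSketch {
    private final Set<Long> known = new HashSet<>();
    private final Set<Long> visited = new HashSet<>();

    private long hash(String url) {
        long h = 1125899906842597L;
        for (int i = 0; i < url.length(); i++)
            h = 31 * h + url.charAt(i);
        return h;
    }

    // Discovered links enter the frontier: known, but not yet visited
    public void addToQueue(String url) { known.add(hash(url)); }

    // Fetched documents are both known and visited
    public void markVisited(String url) { known.add(hash(url)); visited.add(hash(url)); }

    public boolean isKnown(String url)   { return known.contains(hash(url)); }
    public boolean isVisited(String url) { return visited.contains(hash(url)); }

    public static void main(String[] args) {
        var frontier = new FrontierSketch();
        frontier.markVisited("https://www.example.com/");
        frontier.addToQueue("https://www.example.com/about");

        System.out.println(frontier.isVisited("https://www.example.com/about")); // false
        System.out.println(frontier.isKnown("https://www.example.com/about"));   // true
    }
}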
@@ -162,6 +166,9 @@ public class DomainCrawlFrontier {
         for (var link : parsed.getElementsByTag("frame")) {
             linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
         }
+        for (var meta : parsed.select("meta[http-equiv=refresh]")) {
+            linkParser.parseMetaRedirect(baseUrl, meta).ifPresent(this::addToQueue);
+        }
         for (var link : parsed.getElementsByTag("iframe")) {
             linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
         }
CrawlerRetreiverTest.java
@@ -17,18 +17,19 @@ import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
+import org.jetbrains.annotations.NotNull;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.*;
 
 import java.io.IOException;
 import java.io.RandomAccessFile;
+import java.net.URISyntaxException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;
 import java.util.stream.Collectors;
 
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.*;
 
 @Tag("slow")
 class CrawlerRetreiverTest {
@@ -209,6 +210,62 @@ class CrawlerRetreiverTest {
 
     }
 
+    @Test
+    public void testRedirect() throws IOException, URISyntaxException {
+        var specs = CrawlSpecRecord
+                .builder()
+                .crawlDepth(3)
+                .domain("www.marginalia.nu")
+                .urls(List.of(
+                        "https://www.marginalia.nu/log/06-optimization.gmi"
+                ))
+                .build();
+
+        List<SerializableCrawlData> data = new ArrayList<>();
+
+        tempFileWarc1 = Files.createTempFile("crawling-process", ".warc");
+
+        DomainCrawlFrontier frontier = doCrawl(tempFileWarc1, specs);
+
+        assertTrue(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/06-optimization.gmi/")));
+        assertTrue(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/06-optimization.gmi")));
+        assertTrue(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/")));
+
+        assertFalse(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/06-optimization/")));
+        assertTrue(frontier.isKnown(new EdgeUrl("https://www.marginalia.nu/log/06-optimization/")));
+
+        convertToParquet(tempFileWarc1, tempFileParquet1);
+
+        try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) {
+            while (stream.hasNext()) {
+                if (stream.next() instanceof CrawledDocument doc) {
+                    data.add(doc);
+                }
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+
+        // The URL https://www.marginalia.nu/log/06-optimization.gmi
+        // redirects to https://www.marginalia.nu/log/06-optimization.gmi/ (note the trailing slash)
+        //
+        // Ensure that the redirect is followed, and that the trailing slash is added
+        // to the url as reported in the parquet file.
+
+        var fetchedUrls =
+                data.stream()
+                        .filter(CrawledDocument.class::isInstance)
+                        .map(CrawledDocument.class::cast)
+                        .peek(doc -> System.out.println(doc.url))
+                        .map(doc -> doc.url)
+                        .collect(Collectors.toSet());
+
+        assertEquals(Set.of("https://www.marginalia.nu/",
+                        "https://www.marginalia.nu/log/06-optimization.gmi/"),
+                fetchedUrls);
+
+    }
+
     @Test
     public void testEmptySet() throws IOException {
 
@@ -418,11 +475,15 @@ class CrawlerRetreiverTest {
         }
     }
 
-    private void doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
+    @NotNull
+    private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
         try (var recorder = new WarcRecorder(tempFileWarc1)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
+            var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
+            crawler.fetch();
+            return crawler.getCrawlFrontier();
         } catch (IOException ex) {
             Assertions.fail(ex);
+            return null; // unreachable
         }
     }
 }