mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 21:29:00 +00:00
data:image/s3,"s3://crabby-images/c765d/c765d5283f4176ac41b612e7ae83ed62e7ddf9a1" alt="Viktor Lofgren"
Refactoring keyword extraction to extract spans information. Modifying the intermediate storage of converted data to use the new slop library, which allows for easier storage of ad-hoc binary data like spans and positions. This is a bit of a Katamari Damacy commit that ended up dragging along a bunch of other fairly tangentially related changes that are hard to break out into separate commits after the fact. Will push as-is to get back to being able to do more isolated work.
56 lines
2.6 KiB
Java
56 lines
2.6 KiB
Java
package nu.marginalia.crawling;
|
|
|
|
import lombok.SneakyThrows;
|
|
import nu.marginalia.crawl.retreival.RateLimitException;
|
|
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
|
|
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
|
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
|
import nu.marginalia.model.EdgeUrl;
|
|
import nu.marginalia.model.body.ContentTypeLogic;
|
|
import nu.marginalia.model.body.DocumentBodyExtractor;
|
|
import nu.marginalia.model.body.DocumentBodyResult;
|
|
import org.junit.jupiter.api.Assertions;
|
|
import org.junit.jupiter.api.Test;
|
|
|
|
import java.io.IOException;
|
|
import java.net.URISyntaxException;
|
|
|
|
class HttpFetcherTest {
|
|
|
|
@SneakyThrows
|
|
@Test
|
|
void testUrlPattern() {
|
|
ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
|
|
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.txt")));
|
|
Assertions.assertTrue(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.bin")));
|
|
Assertions.assertTrue(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.tar.gz")));
|
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.htm")));
|
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.html")));
|
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log")));
|
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.php?id=1")));
|
|
}
|
|
|
|
@Test
|
|
void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
|
|
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
|
|
try (var recorder = new WarcRecorder()) {
|
|
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty());
|
|
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
|
|
System.out.println(bodyOk.contentType());
|
|
}
|
|
}
|
|
}
|
|
|
|
@Test
|
|
void fetchText() throws URISyntaxException, RateLimitException, IOException {
|
|
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
|
|
|
|
try (var recorder = new WarcRecorder()) {
|
|
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty());
|
|
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
|
|
System.out.println(bodyOk.contentType());
|
|
}
|
|
}
|
|
}
|
|
} |