2023-03-04 12:19:01 +00:00
|
|
|
package nu.marginalia.crawling;
|
2022-05-19 15:45:26 +00:00
|
|
|
|
2024-09-23 15:51:07 +00:00
|
|
|
import nu.marginalia.crawl.fetcher.ContentTags;
|
|
|
|
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
|
|
|
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
|
|
|
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
2023-03-04 12:19:01 +00:00
|
|
|
import nu.marginalia.model.EdgeUrl;
|
2024-09-08 08:22:32 +00:00
|
|
|
import nu.marginalia.model.body.ContentTypeLogic;
|
|
|
|
import nu.marginalia.model.body.DocumentBodyExtractor;
|
|
|
|
import nu.marginalia.model.body.DocumentBodyResult;
|
2022-05-19 15:45:26 +00:00
|
|
|
import org.junit.jupiter.api.Assertions;
|
|
|
|
import org.junit.jupiter.api.Test;
|
|
|
|
|
|
|
|
class HttpFetcherTest {
|
|
|
|
|
|
|
|
@Test
|
2024-11-11 20:14:38 +00:00
|
|
|
void testUrlPattern() throws Exception {
|
2022-08-18 16:25:09 +00:00
|
|
|
ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
|
|
|
|
|
|
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.txt")));
|
|
|
|
Assertions.assertTrue(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.bin")));
|
|
|
|
Assertions.assertTrue(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.tar.gz")));
|
|
|
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.htm")));
|
|
|
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.html")));
|
|
|
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log")));
|
|
|
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.php?id=1")));
|
2022-05-19 15:45:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
2024-11-11 20:14:38 +00:00
|
|
|
void fetchUTF8() throws Exception {
|
2023-06-24 18:09:54 +00:00
|
|
|
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
|
2023-12-08 16:12:51 +00:00
|
|
|
try (var recorder = new WarcRecorder()) {
|
2024-08-31 09:32:56 +00:00
|
|
|
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
2023-12-14 15:05:48 +00:00
|
|
|
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
|
2023-12-13 14:33:42 +00:00
|
|
|
System.out.println(bodyOk.contentType());
|
|
|
|
}
|
2023-12-08 16:12:51 +00:00
|
|
|
}
|
2022-05-19 15:45:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
2024-11-11 20:14:38 +00:00
|
|
|
void fetchText() throws Exception {
|
2023-06-24 18:09:54 +00:00
|
|
|
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
|
2023-12-08 16:12:51 +00:00
|
|
|
|
|
|
|
try (var recorder = new WarcRecorder()) {
|
2024-08-31 09:32:56 +00:00
|
|
|
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
2023-12-14 15:05:48 +00:00
|
|
|
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
|
2023-12-13 14:33:42 +00:00
|
|
|
System.out.println(bodyOk.contentType());
|
|
|
|
}
|
2023-12-08 16:12:51 +00:00
|
|
|
}
|
2022-05-19 15:45:26 +00:00
|
|
|
}
|
|
|
|
}
|