mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Don't try to fetch text/css and text/javascript files. Refactor the fetcher to separate out the content-type sniffing logic. Clean up the crawler a smidge.
This commit is contained in:
parent
6b6cd56e3a
commit
340d80f6c7
@@ -20,7 +20,7 @@ import java.net.InetAddress;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.UnknownHostException;
|
||||
import java.time.LocalDateTime;
|
||||
- import java.util.Collections;
|
||||
+ import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Optional;
|
||||
@@ -129,7 +129,7 @@ public class CrawlerRetreiver {
|
||||
var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
|
||||
long crawlDelay = robotsRules.getCrawlDelay();
|
||||
|
||||
- CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, Collections.emptyList(), null);
|
||||
+ CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null);
|
||||
|
||||
int fetchedCount = 0;
|
||||
|
||||
@@ -137,7 +137,7 @@ public class CrawlerRetreiver {
|
||||
var top = queue.removeFirst();
|
||||
|
||||
if (!robotsRules.isAllowed(top.toString())) {
|
||||
- ret.doc.add(createRobotsError(top));
|
||||
+ crawledDomainWriter.accept(createRobotsError(top));
|
||||
continue;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user