mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 21:29:00 +00:00

Look, this will make the git history look funny, but trimming unnecessary depth from the source tree is a very necessary sanity-preserving measure when dealing with a super-modularized codebase like this one. While it makes the project configuration a bit less conventional, it will save you several clicks every time you jump between modules. Which you'll do a lot, because it's *modul*ar. The src/main/java convention makes a lot of sense for a non-modular project though. This ain't that.
31 lines
1.0 KiB
Java
31 lines
1.0 KiB
Java
package nu.marginalia.crawling;
|
|
|
|
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import org.junit.jupiter.api.Test;

import java.nio.charset.StandardCharsets;

import static org.junit.jupiter.api.Assertions.*;
|
|
|
|
class DomainCrawlerRobotsTxtTest {
|
|
@Test
|
|
public void testOverride() {
|
|
String contentsStr = "User-agent: *\n" +
|
|
"Disallow: /\n" +
|
|
"\n" +
|
|
"User-agent: Googlebot\n" +
|
|
"User-agent: YandexBot\n" +
|
|
"User-agent: Twitterbot\n" +
|
|
"User-agent: special_archiver\n" +
|
|
"User-agent: archive.org_bot\n" +
|
|
"User-agent: search.marginalia.nu\n" +
|
|
"Disallow:\n";
|
|
|
|
byte[] contents = contentsStr.getBytes();
|
|
SimpleRobotRules rules = new SimpleRobotRulesParser().parseContent("https://www.brutman.com/robots.txt",
|
|
contents,
|
|
"text/plain",
|
|
"search.marginalia.nu");
|
|
|
|
assertTrue(rules.isAllowed("http://www.brutman.com/test"));
|
|
}
|
|
} |