Some small crawler tweaks, plus a test for examining crawler behavior through a simulated server.

This commit is contained in:
vlofgren 2022-09-16 16:59:06 +02:00
parent 5e67391829
commit 123603b0a3

View File

@@ -30,8 +30,17 @@ public class QueryParams {
if (path.endsWith(".cgi")) return true; if (path.endsWith(".cgi")) return true;
if (param.startsWith("id=")) return true; if (param.startsWith("id=")) return true;
if (param.startsWith("p=")) return true; if (param.startsWith("p=")) {
// Don't retain forum links with post IDs; they're always non-canonical and eat up a lot of
// crawling bandwidth.
if (path.endsWith("showthread.php") || path.endsWith("viewtopic.php")) {
return false;
}
return true;
}
if (param.startsWith("i=")) return true; if (param.startsWith("i=")) return true;
if (param.startsWith("start=")) return true;
if (param.startsWith("t=")) return true; if (param.startsWith("t=")) return true;
if (param.startsWith("v=")) return true; if (param.startsWith("v=")) return true;
if (param.startsWith("post=")) return true; if (param.startsWith("post=")) return true;