2023-03-04 12:19:01 +00:00
|
|
|
package nu.marginalia.util;
|
2022-07-08 14:36:09 +00:00
|
|
|
|
2022-10-19 13:00:04 +00:00
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
|
|
2022-07-08 14:36:09 +00:00
|
|
|
import javax.annotation.Nullable;
|
2022-10-19 13:00:04 +00:00
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.Comparator;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.StringJoiner;
|
2022-07-08 14:36:09 +00:00
|
|
|
|
|
|
|
public class QueryParams {
|
|
|
|
|
|
|
|
@Nullable
|
|
|
|
public static String queryParamsSanitizer(String path, @Nullable String queryParams) {
|
|
|
|
if (queryParams == null) {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
2022-10-19 13:00:04 +00:00
|
|
|
String ret;
|
|
|
|
if (queryParams.indexOf('&') >= 0) {
|
|
|
|
|
|
|
|
List<String> parts = new ArrayList<>();
|
|
|
|
for (var part : StringUtils.split(queryParams, '&')) {
|
|
|
|
if (QueryParams.isPermittedParam(path, part)) {
|
|
|
|
parts.add(part);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (parts.size() > 1) {
|
|
|
|
parts.sort(Comparator.naturalOrder());
|
|
|
|
}
|
|
|
|
StringJoiner retJoiner = new StringJoiner("&");
|
|
|
|
parts.forEach(retJoiner::add);
|
|
|
|
ret = retJoiner.toString();
|
|
|
|
}
|
|
|
|
else if (isPermittedParam(path, queryParams)) {
|
|
|
|
ret = queryParams;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
return null;
|
|
|
|
}
|
2022-07-08 14:36:09 +00:00
|
|
|
|
|
|
|
if (ret.isBlank())
|
|
|
|
return null;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
public static boolean isPermittedParam(String path, String param) {
|
2022-08-16 20:48:16 +00:00
|
|
|
if (path.endsWith(".cgi")) return true;
|
|
|
|
|
2022-09-16 15:12:07 +00:00
|
|
|
if (path.endsWith("/posting.php")) return false;
|
|
|
|
|
2022-08-16 20:48:16 +00:00
|
|
|
if (param.startsWith("id=")) return true;
|
2022-09-16 14:59:06 +00:00
|
|
|
if (param.startsWith("p=")) {
|
|
|
|
// Don't retain forum links with post-id:s, they're always non-canonical and eat up a lot of
|
|
|
|
// crawling bandwidth
|
|
|
|
|
|
|
|
if (path.endsWith("showthread.php") || path.endsWith("viewtopic.php")) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
2022-09-16 15:01:06 +00:00
|
|
|
if (param.startsWith("f=")) {
|
|
|
|
if (path.endsWith("showthread.php") || path.endsWith("viewtopic.php")) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
2022-08-16 20:48:16 +00:00
|
|
|
if (param.startsWith("i=")) return true;
|
2022-09-16 14:59:06 +00:00
|
|
|
if (param.startsWith("start=")) return true;
|
2022-08-16 20:48:16 +00:00
|
|
|
if (param.startsWith("t=")) return true;
|
|
|
|
if (param.startsWith("v=")) return true;
|
2022-09-16 15:01:06 +00:00
|
|
|
|
2022-08-16 20:48:16 +00:00
|
|
|
if (param.startsWith("post=")) return true;
|
|
|
|
|
2022-07-08 14:36:09 +00:00
|
|
|
if (path.endsWith("index.php")) {
|
2022-08-12 11:50:18 +00:00
|
|
|
if (param.startsWith("showtopic="))
|
2022-07-08 14:36:09 +00:00
|
|
|
return true;
|
2022-08-12 11:50:18 +00:00
|
|
|
if (param.startsWith("showforum="))
|
2022-07-08 14:36:09 +00:00
|
|
|
return true;
|
|
|
|
}
|
2022-08-12 11:50:18 +00:00
|
|
|
|
|
|
|
if (path.endsWith("StoryView.py")) { // folklore.org is neat
|
|
|
|
return param.startsWith("project=") || param.startsWith("story=");
|
|
|
|
}
|
2025-01-07 19:21:44 +00:00
|
|
|
|
|
|
|
// www.perseus.tufts.edu:
|
|
|
|
if (param.startsWith("collection=")) return true;
|
|
|
|
if (param.startsWith("doc=")) return true;
|
|
|
|
|
2022-07-08 14:36:09 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|