mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 21:29:00 +00:00

Look, this will make the git history look funny, but trimming unnecessary depth from the source tree is a very necessary sanity-preserving measure when dealing with a super-modularized codebase like this one. While it makes the project configuration a bit less conventional, it will save you several clicks every time you jump between modules. Which you'll do a lot, because it's *modul*ar. The src/main/java convention makes a lot of sense for a non-modular project though. This ain't that.
90 lines
2.6 KiB
Java
90 lines
2.6 KiB
Java
package nu.marginalia.util;
|
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
|
import javax.annotation.Nullable;
|
|
import java.util.ArrayList;
|
|
import java.util.Comparator;
|
|
import java.util.List;
|
|
import java.util.StringJoiner;
|
|
|
|
public class QueryParams {
|
|
|
|
|
|
@Nullable
|
|
public static String queryParamsSanitizer(String path, @Nullable String queryParams) {
|
|
if (queryParams == null) {
|
|
return null;
|
|
}
|
|
|
|
String ret;
|
|
if (queryParams.indexOf('&') >= 0) {
|
|
|
|
List<String> parts = new ArrayList<>();
|
|
for (var part : StringUtils.split(queryParams, '&')) {
|
|
if (QueryParams.isPermittedParam(path, part)) {
|
|
parts.add(part);
|
|
}
|
|
}
|
|
if (parts.size() > 1) {
|
|
parts.sort(Comparator.naturalOrder());
|
|
}
|
|
StringJoiner retJoiner = new StringJoiner("&");
|
|
parts.forEach(retJoiner::add);
|
|
ret = retJoiner.toString();
|
|
}
|
|
else if (isPermittedParam(path, queryParams)) {
|
|
ret = queryParams;
|
|
}
|
|
else {
|
|
return null;
|
|
}
|
|
|
|
if (ret.isBlank())
|
|
return null;
|
|
|
|
return ret;
|
|
}
|
|
|
|
public static boolean isPermittedParam(String path, String param) {
|
|
if (path.endsWith(".cgi")) return true;
|
|
|
|
if (path.endsWith("/posting.php")) return false;
|
|
|
|
if (param.startsWith("id=")) return true;
|
|
if (param.startsWith("p=")) {
|
|
// Don't retain forum links with post-id:s, they're always non-canonical and eat up a lot of
|
|
// crawling bandwidth
|
|
|
|
if (path.endsWith("showthread.php") || path.endsWith("viewtopic.php")) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
if (param.startsWith("f=")) {
|
|
if (path.endsWith("showthread.php") || path.endsWith("viewtopic.php")) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
if (param.startsWith("i=")) return true;
|
|
if (param.startsWith("start=")) return true;
|
|
if (param.startsWith("t=")) return true;
|
|
if (param.startsWith("v=")) return true;
|
|
|
|
if (param.startsWith("post=")) return true;
|
|
|
|
if (path.endsWith("index.php")) {
|
|
if (param.startsWith("showtopic="))
|
|
return true;
|
|
if (param.startsWith("showforum="))
|
|
return true;
|
|
}
|
|
|
|
if (path.endsWith("StoryView.py")) { // folklore.org is neat
|
|
return param.startsWith("project=") || param.startsWith("story=");
|
|
}
|
|
return false;
|
|
}
|
|
}
|