(link-parser) Filter out URLs with binary file suffixes in LinkParser

Added an additional filter step to ensure URLs with binary suffixes are excluded during crawling. This prevents unnecessary processing of non-HTML content, improving the efficiency of the link parsing process.
This commit is contained in:
Viktor Lofgren 2024-12-11 16:42:47 +01:00
parent a97c05107e
commit 3b99cffb3d

View File

@@ -42,7 +42,8 @@ public class LinkParser {
.flatMap(this::createURI)
.map(URI::normalize)
.map(this::renormalize)
- .flatMap(this::createEdgeUrl);
+ .flatMap(this::createEdgeUrl)
+ .filter(url -> !hasBinarySuffix(url.path));
}
@Contract(pure=true)