From 3b99cffb3dcfa459aa1cb8cdb2b2a4ed5a05b0f3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 11 Dec 2024 16:42:47 +0100 Subject: [PATCH] (link-parser) Filter out URLs with binary file suffixes in LinkParser Add a filter step at the end of the URL-parsing chain so that URLs whose path has a binary file suffix are discarded and therefore excluded during crawling. This prevents unnecessary processing of non-HTML content, improving the efficiency of the crawl. --- .../java/nu/marginalia/link_parser/LinkParser.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/LinkParser.java b/code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/LinkParser.java index 8a04863d..717ae8a5 100644 --- a/code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/LinkParser.java +++ b/code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/LinkParser.java @@ -42,7 +42,8 @@ public class LinkParser { .flatMap(this::createURI) .map(URI::normalize) .map(this::renormalize) - .flatMap(this::createEdgeUrl); + .flatMap(this::createEdgeUrl) + .filter(url -> !hasBinarySuffix(url.path)); } @Contract(pure=true)