mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(link-parser) Filter out URLs with binary file suffixes in LinkParser
Added a filter step so that URLs with binary file suffixes are excluded during crawling. This avoids unnecessary processing of non-HTML content and makes link parsing more efficient.
This commit is contained in:
parent
a97c05107e
commit
3b99cffb3d
@ -42,7 +42,8 @@ public class LinkParser {
|
|||||||
.flatMap(this::createURI)
|
.flatMap(this::createURI)
|
||||||
.map(URI::normalize)
|
.map(URI::normalize)
|
||||||
.map(this::renormalize)
|
.map(this::renormalize)
|
||||||
.flatMap(this::createEdgeUrl);
|
.flatMap(this::createEdgeUrl)
|
||||||
|
.filter(url -> !hasBinarySuffix(url.path));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Contract(pure=true)
|
@Contract(pure=true)
|
||||||
|
Loading…
Reference in New Issue
Block a user