From 923ebbac81624e556f50a9de29248cc5a9fd3a34 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Sat, 23 Nov 2024 16:38:06 +0100
Subject: [PATCH] (feeds) Add logic to handle URI fragments in feed items

Introduced a method to decide whether to retain URI fragments in feed items based on their uniqueness. Enhanced FeedItem processing to conditionally strip fragments to maintain clean URLs where applicable.
---
 .../nu/marginalia/rss/model/FeedItem.java     | 21 ++++++++-
 .../rss/svc/FeedFetcherService.java           | 46 ++++++++++++++++++-
 2 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/code/functions/live-capture/java/nu/marginalia/rss/model/FeedItem.java b/code/functions/live-capture/java/nu/marginalia/rss/model/FeedItem.java
index e55ed6b6..9d544a20 100644
--- a/code/functions/live-capture/java/nu/marginalia/rss/model/FeedItem.java
+++ b/code/functions/live-capture/java/nu/marginalia/rss/model/FeedItem.java
@@ -5,6 +5,7 @@ import org.apache.commons.lang3.StringUtils;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.Jsoup;
 
+import java.net.URI;
 import java.time.ZonedDateTime;
 import java.time.format.DateTimeFormatter;
 import java.util.Optional;
@@ -17,11 +18,27 @@ public record FeedItem(String title,
     public static final int MAX_DESC_LENGTH = 255;
     public static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
 
-    public static FeedItem fromItem(Item item) {
+    public static FeedItem fromItem(Item item, boolean keepFragment) {
         String title = item.getTitle().orElse("");
         String date = getItemDate(item);
         String description = getItemDescription(item);
-        String url = item.getLink().orElse("");
+        String url;
+
+        if (keepFragment || item.getLink().isEmpty()) {
+            url = item.getLink().orElse("");
+        }
+        else {
+            try {
+                String link = item.getLink().get();
+                var linkUri = new URI(link);
+                var cleanUri = new URI(linkUri.getScheme(), linkUri.getAuthority(), linkUri.getPath(), linkUri.getQuery(), null);
+                url = cleanUri.toString();
+            }
+            catch (Exception e) {
+                // fallback to original link if we can't clean it, this is not a very important step
+                url = item.getLink().get();
+            }
+        }
 
         return new FeedItem(title, date, description, url);
     }
diff --git a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java
index de44b7be..5cbf6019 100644
--- a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java
+++ b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java
@@ -1,5 +1,6 @@
 package nu.marginalia.rss.svc;
 
+import com.apptasticsoftware.rssreader.Item;
 import com.apptasticsoftware.rssreader.RssReader;
 import com.google.inject.Inject;
 import com.opencsv.CSVReader;
@@ -21,6 +22,8 @@ import org.slf4j.LoggerFactory;
 
 import javax.annotation.Nullable;
 import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
 import java.net.http.HttpClient;
 import java.sql.SQLException;
 import java.time.Duration;
@@ -230,8 +233,12 @@ public class FeedFetcherService {
 
     public FeedItems fetchFeed(FeedDefinition definition) {
         try {
-            var items = rssReader.read(definition.feedUrl())
-                    .map(FeedItem::fromItem)
+            List<Item> rawItems = rssReader.read(definition.feedUrl()).toList();
+
+            boolean keepUriFragment = rawItems.size() < 2 || areFragmentsDisparate(rawItems);
+
+            var items = rawItems.stream()
+                    .map(item -> FeedItem.fromItem(item, keepUriFragment))
                     .filter(new IsFeedItemDateValid())
                     .sorted()
                     .limit(MAX_FEED_ITEMS)
@@ -249,6 +256,41 @@ public class FeedFetcherService {
         }
     }
 
+    /** Decide whether to keep URI fragments in the feed items.
+     * <p>
+     * We keep fragments if there are multiple different fragments in the items.
+     *
+     * @param items The items to check
+     * @return True if we should keep the fragments, false otherwise
+     */
+    private boolean areFragmentsDisparate(List<Item> items) {
+        Set<String> seenFragments = new HashSet<>();
+
+        try {
+            for (var item : items) {
+                if (item.getLink().isEmpty()) {
+                    continue;
+                }
+
+                var link = item.getLink().get();
+                if (!link.contains("#")) {
+                    continue;
+                }
+
+                var fragment = new URI(link).getFragment();
+                if (fragment != null) {
+                    seenFragments.add(fragment);
+                }
+            }
+        }
+        catch (URISyntaxException e) {
+            logger.debug("Exception", e);
+            return true; // safe default
+        }
+
+        return seenFragments.size() > 1;
+    }
+
     private static class IsFeedItemDateValid implements Predicate<FeedItem> {
         private final String today = ZonedDateTime.now().format(DateTimeFormatter.ISO_ZONED_DATE_TIME);
 