(feeds) Add logic to handle URI fragments in feed items

Introduced a method to decide whether to retain URI fragments in feed items based on their uniqueness. Enhanced FeedItem processing to conditionally strip fragments to maintain clean URLs where applicable.
This commit is contained in:
Viktor Lofgren 2024-11-23 16:38:06 +01:00
parent df298df852
commit 923ebbac81
2 changed files with 63 additions and 4 deletions

View File

@ -5,6 +5,7 @@ import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.jsoup.Jsoup;
import java.net.URI;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Optional;
@ -17,11 +18,27 @@ public record FeedItem(String title,
public static final int MAX_DESC_LENGTH = 255;
public static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
public static FeedItem fromItem(Item item) {
public static FeedItem fromItem(Item item, boolean keepFragment) {
String title = item.getTitle().orElse("");
String date = getItemDate(item);
String description = getItemDescription(item);
String url = item.getLink().orElse("");
String url;
if (keepFragment || item.getLink().isEmpty()) {
url = item.getLink().orElse("");
}
else {
try {
String link = item.getLink().get();
var linkUri = new URI(link);
var cleanUri = new URI(linkUri.getScheme(), linkUri.getAuthority(), linkUri.getPath(), linkUri.getQuery(), null);
url = cleanUri.toString();
}
catch (Exception e) {
// fallback to original link if we can't clean it, this is not a very important step
url = item.getLink().get();
}
}
return new FeedItem(title, date, description, url);
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.rss.svc;
import com.apptasticsoftware.rssreader.Item;
import com.apptasticsoftware.rssreader.RssReader;
import com.google.inject.Inject;
import com.opencsv.CSVReader;
@ -21,6 +22,8 @@ import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.sql.SQLException;
import java.time.Duration;
@ -230,8 +233,12 @@ public class FeedFetcherService {
public FeedItems fetchFeed(FeedDefinition definition) {
try {
var items = rssReader.read(definition.feedUrl())
.map(FeedItem::fromItem)
List<Item> rawItems = rssReader.read(definition.feedUrl()).toList();
boolean keepUriFragment = rawItems.size() < 2 || areFragmentsDisparate(rawItems);
var items = rawItems.stream()
.map(item -> FeedItem.fromItem(item, keepUriFragment))
.filter(new IsFeedItemDateValid())
.sorted()
.limit(MAX_FEED_ITEMS)
@ -249,6 +256,41 @@ public class FeedFetcherService {
}
}
/** Determines whether the URI fragments of the given feed items should be preserved.
 * <p>
 * Fragments are preserved when the item links carry at least two distinct
 * fragments. Items without a link, and links that contain no {@code '#'},
 * are ignored. If a link containing {@code '#'} cannot be parsed as a URI,
 * the method gives up and answers true, which leaves the links unmodified.
 *
 * @param items The items to inspect
 * @return True if the fragments should be kept, false otherwise
 */
private boolean areFragmentsDisparate(List<Item> items) {
    final Set<String> distinctFragments = new HashSet<>();

    for (Item entry : items) {
        var maybeLink = entry.getLink();

        // Only links that actually contain a '#' are worth parsing
        if (maybeLink.isEmpty() || maybeLink.get().indexOf('#') < 0) {
            continue;
        }

        final String fragment;
        try {
            fragment = new URI(maybeLink.get()).getFragment();
        }
        catch (URISyntaxException e) {
            logger.debug("Exception", e);
            return true; // safe default
        }

        if (fragment != null) {
            distinctFragments.add(fragment);
        }
    }

    return distinctFragments.size() >= 2;
}
private static class IsFeedItemDateValid implements Predicate<FeedItem> {
private final String today = ZonedDateTime.now().format(DateTimeFormatter.ISO_ZONED_DATE_TIME);