mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(feeds) Add logic to handle URI fragments in feed items
Introduced a method to decide whether to retain URI fragments in feed items based on their uniqueness. Enhanced FeedItem processing to conditionally strip fragments to maintain clean URLs where applicable.
This commit is contained in:
parent
df298df852
commit
923ebbac81
@ -5,6 +5,7 @@ import org.apache.commons.lang3.StringUtils;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
import java.net.URI;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.Optional;
|
||||
@ -17,11 +18,27 @@ public record FeedItem(String title,
|
||||
public static final int MAX_DESC_LENGTH = 255;
|
||||
public static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
|
||||
|
||||
public static FeedItem fromItem(Item item) {
|
||||
public static FeedItem fromItem(Item item, boolean keepFragment) {
|
||||
String title = item.getTitle().orElse("");
|
||||
String date = getItemDate(item);
|
||||
String description = getItemDescription(item);
|
||||
String url = item.getLink().orElse("");
|
||||
String url;
|
||||
|
||||
if (keepFragment || item.getLink().isEmpty()) {
|
||||
url = item.getLink().orElse("");
|
||||
}
|
||||
else {
|
||||
try {
|
||||
String link = item.getLink().get();
|
||||
var linkUri = new URI(link);
|
||||
var cleanUri = new URI(linkUri.getScheme(), linkUri.getAuthority(), linkUri.getPath(), linkUri.getQuery(), null);
|
||||
url = cleanUri.toString();
|
||||
}
|
||||
catch (Exception e) {
|
||||
// fallback to original link if we can't clean it, this is not a very important step
|
||||
url = item.getLink().get();
|
||||
}
|
||||
}
|
||||
|
||||
return new FeedItem(title, date, description, url);
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
package nu.marginalia.rss.svc;
|
||||
|
||||
import com.apptasticsoftware.rssreader.Item;
|
||||
import com.apptasticsoftware.rssreader.RssReader;
|
||||
import com.google.inject.Inject;
|
||||
import com.opencsv.CSVReader;
|
||||
@ -21,6 +22,8 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Duration;
|
||||
@ -230,8 +233,12 @@ public class FeedFetcherService {
|
||||
|
||||
public FeedItems fetchFeed(FeedDefinition definition) {
|
||||
try {
|
||||
var items = rssReader.read(definition.feedUrl())
|
||||
.map(FeedItem::fromItem)
|
||||
List<Item> rawItems = rssReader.read(definition.feedUrl()).toList();
|
||||
|
||||
boolean keepUriFragment = rawItems.size() < 2 || areFragmentsDisparate(rawItems);
|
||||
|
||||
var items = rawItems.stream()
|
||||
.map(item -> FeedItem.fromItem(item, keepUriFragment))
|
||||
.filter(new IsFeedItemDateValid())
|
||||
.sorted()
|
||||
.limit(MAX_FEED_ITEMS)
|
||||
@ -249,6 +256,41 @@ public class FeedFetcherService {
|
||||
}
|
||||
}
|
||||
|
||||
/** Decide whether to keep URI fragments in the feed items.
|
||||
* <p></p>
|
||||
* We keep fragments if there are multiple different fragments in the items.
|
||||
*
|
||||
* @param items The items to check
|
||||
* @return True if we should keep the fragments, false otherwise
|
||||
*/
|
||||
private boolean areFragmentsDisparate(List<Item> items) {
|
||||
Set<String> seenFragments = new HashSet<>();
|
||||
|
||||
try {
|
||||
for (var item : items) {
|
||||
if (item.getLink().isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
var link = item.getLink().get();
|
||||
if (!link.contains("#")) {
|
||||
continue;
|
||||
}
|
||||
|
||||
var fragment = new URI(link).getFragment();
|
||||
if (fragment != null) {
|
||||
seenFragments.add(fragment);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (URISyntaxException e) {
|
||||
logger.debug("Exception", e);
|
||||
return true; // safe default
|
||||
}
|
||||
|
||||
return seenFragments.size() > 1;
|
||||
}
|
||||
|
||||
private static class IsFeedItemDateValid implements Predicate<FeedItem> {
|
||||
private final String today = ZonedDateTime.now().format(DateTimeFormatter.ISO_ZONED_DATE_TIME);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user