mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 04:58:59 +00:00
(feeds) Make feed XML parsing more lenient
... by consuming BOM markers and trimming leading and trailing whitespace.
This commit is contained in:
parent
b66fb9caf6
commit
2dc9f2e639
@ -31,6 +31,7 @@ dependencies {
|
||||
implementation libs.sqlite
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.commons.lang3
|
||||
implementation libs.commons.io
|
||||
|
||||
implementation libs.prometheus
|
||||
implementation libs.guava
|
||||
|
@ -18,6 +18,7 @@ import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.apache.commons.io.input.BOMInputStream;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -29,6 +30,7 @@ import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Duration;
|
||||
import java.time.LocalDateTime;
|
||||
@ -150,7 +152,7 @@ public class FeedFetcherService {
|
||||
}
|
||||
|
||||
switch (feedData) {
|
||||
case FetchResult.Success(String value) -> writer.saveFeed(fetchFeed(value, feed));
|
||||
case FetchResult.Success(String value) -> writer.saveFeed(parseFeed(value, feed));
|
||||
case FetchResult.TransientError() -> {
|
||||
int errorCount = errorCounts.getOrDefault(feed.domain().toLowerCase(), 0);
|
||||
writer.setErrorCount(feed.domain().toLowerCase(), ++errorCount);
|
||||
@ -296,9 +298,12 @@ public class FeedFetcherService {
|
||||
}
|
||||
}
|
||||
|
||||
public FeedItems fetchFeed(String feedData, FeedDefinition definition) {
|
||||
public FeedItems parseFeed(String feedData, FeedDefinition definition) {
|
||||
try {
|
||||
List<Item> rawItems = rssReader.read(new ByteArrayInputStream(feedData.getBytes())).toList();
|
||||
List<Item> rawItems = rssReader.read(
|
||||
// Massage the data to maximize the possibility of the flaky XML parser consuming it
|
||||
new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
|
||||
).toList();
|
||||
|
||||
boolean keepUriFragment = rawItems.size() < 2 || areFragmentsDisparate(rawItems);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user