diff --git a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java index 396fee1e..646c2788 100644 --- a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java @@ -316,6 +316,8 @@ public class FeedFetcherService { public FeedItems parseFeed(String feedData, FeedDefinition definition) { try { + feedData = sanitizeEntities(feedData); + List rawItems = rssReader.read( // Massage the data to maximize the possibility of the flaky XML parser consuming it new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false) @@ -342,6 +344,32 @@ public class FeedFetcherService { } } + private static final Map<String, String> HTML_ENTITIES = Map.of( + "&raquo;", "»", + "&laquo;", "«", + "&mdash;", "--", + "&ndash;", "-", + "&rsquo;", "'", + "&lsquo;", "'", + "&nbsp;", "" + ); + + /** The XML parser will blow up if you insert HTML entities in the feed XML, + * which is unfortunately relatively common. Replace them as far as is possible + * with their corresponding characters + */ + static String sanitizeEntities(String feedData) { + String result = feedData; + for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) { + result = result.replace(entry.getKey(), entry.getValue()); + } + + // Handle lone ampersands not part of a recognized XML entity + result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&amp;"); + + return result; + } + /** Decide whether to keep URI fragments in the feed items. *

* We keep fragments if there are multiple different fragments in the items. diff --git a/code/functions/live-capture/test/nu/marginalia/rss/svc/FeedFetcherServiceTest.java b/code/functions/live-capture/test/nu/marginalia/rss/svc/FeedFetcherServiceTest.java index 88fb07cf..3ddd7f49 100644 --- a/code/functions/live-capture/test/nu/marginalia/rss/svc/FeedFetcherServiceTest.java +++ b/code/functions/live-capture/test/nu/marginalia/rss/svc/FeedFetcherServiceTest.java @@ -99,7 +99,9 @@ class FeedFetcherServiceTest extends AbstractModule { feedFetcherService.setDeterministic(); feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH); - Assertions.assertFalse(feedDb.getFeed(new EdgeDomain("www.marginalia.nu")).isEmpty()); + var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu")); + System.out.println(result); + Assertions.assertFalse(result.isEmpty()); } @Tag("flaky") diff --git a/code/functions/live-capture/test/nu/marginalia/rss/svc/TestXmlSanitization.java b/code/functions/live-capture/test/nu/marginalia/rss/svc/TestXmlSanitization.java new file mode 100644 index 00000000..36bb92f7 --- /dev/null +++ b/code/functions/live-capture/test/nu/marginalia/rss/svc/TestXmlSanitization.java @@ -0,0 +1,26 @@ +package nu.marginalia.rss.svc; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +public class TestXmlSanitization { + + @Test + public void testPreservedEntities() { + Assertions.assertEquals("&amp;", FeedFetcherService.sanitizeEntities("&amp;")); + Assertions.assertEquals("&lt;", FeedFetcherService.sanitizeEntities("&lt;")); + Assertions.assertEquals("&gt;", FeedFetcherService.sanitizeEntities("&gt;")); + Assertions.assertEquals("&quot;", FeedFetcherService.sanitizeEntities("&quot;")); + Assertions.assertEquals("&apos;", FeedFetcherService.sanitizeEntities("&apos;")); + } + + @Test + public void testStrayAmpersand() { + Assertions.assertEquals("Bed &amp; Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast")); + } + + @Test + public void 
testTranslatedHtmlEntity() { + Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo &mdash; Bar")); + } +}