mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(feed) Sanitize illegal HTML entities out of the feed XML before parsing
This commit is contained in:
parent
94d4d2edb7
commit
41a59dcf45
@ -316,6 +316,8 @@ public class FeedFetcherService {
|
|||||||
|
|
||||||
public FeedItems parseFeed(String feedData, FeedDefinition definition) {
|
public FeedItems parseFeed(String feedData, FeedDefinition definition) {
|
||||||
try {
|
try {
|
||||||
|
feedData = sanitizeEntities(feedData);
|
||||||
|
|
||||||
List<Item> rawItems = rssReader.read(
|
List<Item> rawItems = rssReader.read(
|
||||||
// Massage the data to maximize the possibility of the flaky XML parser consuming it
|
// Massage the data to maximize the possibility of the flaky XML parser consuming it
|
||||||
new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
|
new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
|
||||||
@ -342,6 +344,32 @@ public class FeedFetcherService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static final Map<String, String> HTML_ENTITIES = Map.of(
|
||||||
|
"»", "»",
|
||||||
|
"«", "«",
|
||||||
|
"—", "--",
|
||||||
|
"–", "-",
|
||||||
|
"’", "'",
|
||||||
|
"‘", "'",
|
||||||
|
" ", ""
|
||||||
|
);
|
||||||
|
|
||||||
|
/** The XML parser will blow up if you insert HTML entities in the feed XML,
|
||||||
|
* which is unfortunately relatively common. Replace them as far as is possible
|
||||||
|
* with their corresponding characters
|
||||||
|
*/
|
||||||
|
static String sanitizeEntities(String feedData) {
|
||||||
|
String result = feedData;
|
||||||
|
for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
|
||||||
|
result = result.replace(entry.getKey(), entry.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle lone ampersands not part of a recognized XML entity
|
||||||
|
result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&");
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
/** Decide whether to keep URI fragments in the feed items.
|
/** Decide whether to keep URI fragments in the feed items.
|
||||||
* <p></p>
|
* <p></p>
|
||||||
* We keep fragments if there are multiple different fragments in the items.
|
* We keep fragments if there are multiple different fragments in the items.
|
||||||
|
@ -99,7 +99,9 @@ class FeedFetcherServiceTest extends AbstractModule {
|
|||||||
feedFetcherService.setDeterministic();
|
feedFetcherService.setDeterministic();
|
||||||
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
|
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
|
||||||
|
|
||||||
Assertions.assertFalse(feedDb.getFeed(new EdgeDomain("www.marginalia.nu")).isEmpty());
|
var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu"));
|
||||||
|
System.out.println(result);
|
||||||
|
Assertions.assertFalse(result.isEmpty());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Tag("flaky")
|
@Tag("flaky")
|
||||||
|
@ -0,0 +1,26 @@
|
|||||||
|
package nu.marginalia.rss.svc;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
public class TestXmlSanitization {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPreservedEntities() {
|
||||||
|
Assertions.assertEquals("&", FeedFetcherService.sanitizeEntities("&"));
|
||||||
|
Assertions.assertEquals("<", FeedFetcherService.sanitizeEntities("<"));
|
||||||
|
Assertions.assertEquals(">", FeedFetcherService.sanitizeEntities(">"));
|
||||||
|
Assertions.assertEquals(""", FeedFetcherService.sanitizeEntities("""));
|
||||||
|
Assertions.assertEquals("'", FeedFetcherService.sanitizeEntities("'"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testStrayAmpersand() {
|
||||||
|
Assertions.assertEquals("Bed & Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testTranslatedHtmlEntity() {
|
||||||
|
Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo — Bar"));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user