From 1694e9c78c16b829d0339fe46a4135d05bb2fa9a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 26 Dec 2023 16:21:40 +0100 Subject: [PATCH 1/3] (search) Add RSS Feeds to site info This change integrates the Feedlot RSS Bot with Marginalia's site info view to offer a preview of the latest updates. The change introduces a new tiny feature that is a feedlot-client based on Java's HttpClient. --- .../feedlot-client/build.gradle | 22 +++++ code/features-search/feedlot-client/readme.md | 20 ++++ .../nu/marginalia/feedlot/FeedlotClient.java | 58 ++++++++++++ .../nu/marginalia/feedlot/model/FeedItem.java | 12 +++ .../marginalia/feedlot/model/FeedItems.java | 6 ++ .../search-service/build.gradle | 1 + .../nu/marginalia/search/SearchModule.java | 15 +++ .../search/svc/SearchSiteInfoService.java | 33 ++++++- .../src/main/resources/static/search/rss.svg | 17 ++++ .../search/site-info/site-info-feed.hdb | 10 ++ .../search/site-info/site-info-summary.hdb | 91 ++++++++++--------- settings.gradle | 1 + 12 files changed, 237 insertions(+), 49 deletions(-) create mode 100644 code/features-search/feedlot-client/build.gradle create mode 100644 code/features-search/feedlot-client/readme.md create mode 100644 code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/FeedlotClient.java create mode 100644 code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItem.java create mode 100644 code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItems.java create mode 100644 code/services-application/search-service/src/main/resources/static/search/rss.svg create mode 100644 code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-feed.hdb diff --git a/code/features-search/feedlot-client/build.gradle b/code/features-search/feedlot-client/build.gradle new file mode 100644 index 00000000..808c9ca6 --- /dev/null +++ b/code/features-search/feedlot-client/build.gradle @@ -0,0 +1,22 @@ 
+plugins { + id 'java' + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(21)) + } +} + +dependencies { + implementation libs.bundles.slf4j + + implementation libs.notnull + implementation libs.gson + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + +} \ No newline at end of file diff --git a/code/features-search/feedlot-client/readme.md b/code/features-search/feedlot-client/readme.md new file mode 100644 index 00000000..76fafff8 --- /dev/null +++ b/code/features-search/feedlot-client/readme.md @@ -0,0 +1,20 @@ +Client for [FeedlotTheFeedBot](https://github.com/MarginaliaSearch/FeedLotTheFeedBot), +the RSS/Atom feed fetcher and cache for Marginalia Search. + +This service is external to the Marginalia Search codebase, +as it is not a core part of the search engine and has other +utilities. + +## Example + +```java + +import java.time.Duration; + +var client = new FeedlotClient("localhost", 8080, + gson, + Duration.ofMillis(100), // connect timeout + Duration.ofMillis(100)); // request timeout + +CompletableFuture<FeedItems> items = client.getFeedItems("www.marginalia.nu"); +``` \ No newline at end of file diff --git a/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/FeedlotClient.java b/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/FeedlotClient.java new file mode 100644 index 00000000..3392a8d2 --- /dev/null +++ b/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/FeedlotClient.java @@ -0,0 +1,58 @@ +package nu.marginalia.feedlot; + +import com.google.gson.Gson; +import nu.marginalia.feedlot.model.FeedItems; + +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.time.Duration; +import java.util.concurrent.Executors; +import java.util.concurrent.CompletableFuture; + +public class FeedlotClient { + private 
final String feedlotHost; + private final int feedlotPort; + private final Gson gson; + private final HttpClient httpClient; + private final Duration requestTimeout; + + public FeedlotClient(String feedlotHost, + int feedlotPort, + Gson gson, + Duration connectTimeout, + Duration requestTimeout + ) + { + this.feedlotHost = feedlotHost; + this.feedlotPort = feedlotPort; + this.gson = gson; + + httpClient = HttpClient.newBuilder() + .executor(Executors.newVirtualThreadPerTaskExecutor()) + .connectTimeout(connectTimeout) + .build(); + this.requestTimeout = requestTimeout; + } + + public CompletableFuture getFeedItems(String domainName) { + return httpClient.sendAsync( + HttpRequest.newBuilder() + .uri(URI.create("http://%s:%d/feed/%s".formatted(feedlotHost, feedlotPort, domainName))) + .GET() + .timeout(requestTimeout) + .build(), + HttpResponse.BodyHandlers.ofString() + ).thenApply(HttpResponse::body) + .thenApply(this::parseFeedItems); + } + + private FeedItems parseFeedItems(String s) { + return gson.fromJson(s, FeedItems.class); + } + + public void stop() { + httpClient.close(); + } +} diff --git a/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItem.java b/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItem.java new file mode 100644 index 00000000..549f6c06 --- /dev/null +++ b/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItem.java @@ -0,0 +1,12 @@ +package nu.marginalia.feedlot.model; + +public record FeedItem(String title, String date, String description, String url) { + + public String pubDay() { // Extract the date from an ISO style date string + if (date.length() > 10) { + return date.substring(0, 10); + } + return date; + } + +} diff --git a/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItems.java b/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItems.java new file mode 100644 index 
00000000..fcf06345 --- /dev/null +++ b/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItems.java @@ -0,0 +1,6 @@ +package nu.marginalia.feedlot.model; + +import java.util.List; + +public record FeedItems(String domain, String feedUrl, String updated, List items) { +} diff --git a/code/services-application/search-service/build.gradle b/code/services-application/search-service/build.gradle index 805a7b34..ee504bcb 100644 --- a/code/services-application/search-service/build.gradle +++ b/code/services-application/search-service/build.gradle @@ -47,6 +47,7 @@ dependencies { implementation project(':code:features-search:screenshots') implementation project(':code:features-search:random-websites') + implementation project(':code:features-search:feedlot-client') implementation libs.bundles.slf4j diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchModule.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchModule.java index 090884ba..d832503c 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchModule.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchModule.java @@ -1,10 +1,15 @@ package nu.marginalia.search; import com.google.inject.AbstractModule; +import com.google.inject.Provides; import nu.marginalia.LanguageModels; import nu.marginalia.WebsiteUrl; import nu.marginalia.WmsaHome; +import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.renderer.config.HandlebarsConfigurator; +import nu.marginalia.feedlot.FeedlotClient; + +import java.time.Duration; public class SearchModule extends AbstractModule { @@ -17,4 +22,14 @@ public class SearchModule extends AbstractModule { System.getProperty("website-url", "https://search.marginalia.nu/"))); } + @Provides + public FeedlotClient provideFeedlotClient() { + return new FeedlotClient( + 
System.getProperty("ext-svc-feedlot-host", "feedlot"), + Integer.getInteger("ext-svc-feedlot-port", 80), + GsonFactory.get(), + Duration.ofMillis(250), + Duration.ofMillis(100) + ); + } } diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java index badaaeed..d62e4cb8 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java @@ -5,13 +5,17 @@ import nu.marginalia.assistant.client.AssistantClient; import nu.marginalia.assistant.client.model.SimilarDomain; import nu.marginalia.client.Context; import nu.marginalia.db.DbDomainQueries; +import nu.marginalia.feedlot.model.FeedItems; import nu.marginalia.model.EdgeDomain; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; import nu.marginalia.search.SearchOperator; import nu.marginalia.assistant.client.model.DomainInformation; +import nu.marginalia.feedlot.FeedlotClient; import nu.marginalia.search.model.UrlDetails; import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import spark.Request; import spark.Response; @@ -21,19 +25,23 @@ import java.util.List; import java.util.Map; public class SearchSiteInfoService { + private static final Logger logger = LoggerFactory.getLogger(SearchSiteInfoService.class); private final SearchOperator searchOperator; private final AssistantClient assistantClient; private final SearchFlagSiteService flagSiteService; private final DbDomainQueries domainQueries; private final MustacheRenderer renderer; + private final FeedlotClient feedlotClient; @Inject public SearchSiteInfoService(SearchOperator searchOperator, AssistantClient 
assistantClient, RendererFactory rendererFactory, SearchFlagSiteService flagSiteService, - DbDomainQueries domainQueries) throws IOException { + DbDomainQueries domainQueries, + FeedlotClient feedlotClient) throws IOException + { this.searchOperator = searchOperator; this.assistantClient = assistantClient; this.flagSiteService = flagSiteService; @@ -41,6 +49,7 @@ public class SearchSiteInfoService { this.renderer = rendererFactory.renderer("search/site-info/site-info"); + this.feedlotClient = feedlotClient; } public Object handle(Request request, Response response) throws SQLException { @@ -121,6 +130,8 @@ public class SearchSiteInfoService { final List linkingDomains; String url = "https://" + domainName + "/";; + var feedItemsFuture = feedlotClient.getFeedItems(domainName); + if (domainId < 0 || !assistantClient.isAccepting()) { domainInfo = createDummySiteInfo(domainName); similarSet = List.of(); @@ -141,12 +152,20 @@ public class SearchSiteInfoService { } } + FeedItems feedItems = null; + try { + feedItems = feedItemsFuture.get(); + } catch (Exception e) { + logger.debug("Failed to get feed items for {}: {}", domainName, e.getMessage()); + } + return new SiteInfoWithContext(domainName, domainId, url, domainInfo, similarSet, - linkingDomains + linkingDomains, + feedItems ); } @@ -200,13 +219,16 @@ public class SearchSiteInfoService { String siteUrl, DomainInformation domainInformation, List similar, - List linking) { + List linking, + FeedItems feed + ) { public SiteInfoWithContext(String domain, long domainId, String siteUrl, DomainInformation domainInformation, List similar, - List linking + List linking, + FeedItems feedInfo ) { this(Map.of("info", true), @@ -216,7 +238,8 @@ public class SearchSiteInfoService { siteUrl, domainInformation, similar, - linking); + linking, + feedInfo); } public String getLayout() { diff --git a/code/services-application/search-service/src/main/resources/static/search/rss.svg 
b/code/services-application/search-service/src/main/resources/static/search/rss.svg new file mode 100644 index 00000000..2c01c8b3 --- /dev/null +++ b/code/services-application/search-service/src/main/resources/static/search/rss.svg @@ -0,0 +1,17 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-feed.hdb b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-feed.hdb new file mode 100644 index 00000000..f51a7444 --- /dev/null +++ b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-feed.hdb @@ -0,0 +1,10 @@ +{{#with feed}} +

Feed (Experimental)

+ +
+ {{#each items}} +
{{title}}
+
{{pubDay}}
{{{description}}}
+ {{/each}} +
+{{/with}} \ No newline at end of file diff --git a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-summary.hdb b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-summary.hdb index fd1c7590..fba7adad 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-summary.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-summary.hdb @@ -12,11 +12,58 @@ Screenshot of {{domain}} {{#with domainInformation}} + {{> search/site-info/site-info-feed}} {{> search/site-info/site-info-index}} {{> search/site-info/site-info-links}} {{/with}} + {{#if linking}} + + {{/if}} + + {{#if similar}}

Similar Domains

@@ -67,48 +114,4 @@
{{/if}} - {{#if linking}} - - {{/if}} \ No newline at end of file diff --git a/settings.gradle b/settings.gradle index 42ae0f47..dbc0c855 100644 --- a/settings.gradle +++ b/settings.gradle @@ -28,6 +28,7 @@ include 'code:libraries:message-queue' include 'code:features-search:screenshots' include 'code:features-search:random-websites' +include 'code:features-search:feedlot-client' include 'code:features-qs:query-parser' include 'code:features-index:result-ranking' From 5d1b7da728747ff762ce22772a240558decefccf Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 26 Dec 2023 22:06:01 +0100 Subject: [PATCH 2/3] Updated site info feed and search service Modified site info feed template to secure the description field against injected code. Also adjusted search service by extracting samples within the correct scope and including them in the returned site info. This improves the quality and security of the displayed information. --- .../nu/marginalia/feedlot/model/FeedItem.java | 5 +++++ .../search/svc/SearchSiteInfoService.java | 21 +++++++++++-------- .../search/site-info/site-info-feed.hdb | 14 +++++++++++-- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItem.java b/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItem.java index 549f6c06..95ea8fe3 100644 --- a/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItem.java +++ b/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItem.java @@ -9,4 +9,9 @@ public record FeedItem(String title, String date, String description, String url return date; } + public String descriptionSafe() { + return description + .replace("<", "&lt;") + .replace(">", "&gt;"); + } } diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java 
b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java index d62e4cb8..c9cb4ec7 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java @@ -131,7 +131,6 @@ public class SearchSiteInfoService { String url = "https://" + domainName + "/";; var feedItemsFuture = feedlotClient.getFeedItems(domainName); - if (domainId < 0 || !assistantClient.isAccepting()) { domainInfo = createDummySiteInfo(domainName); similarSet = List.of(); @@ -145,11 +144,11 @@ public class SearchSiteInfoService { linkingDomains = assistantClient .linkedDomains(ctx, domainId, 100) .blockingFirst(); + } - List sampleResults = searchOperator.doSiteSearch(ctx, domainName, 1); - if (!sampleResults.isEmpty()) { - url = sampleResults.getFirst().url.withPathAndParam("/", null).toString(); - } + List sampleResults = searchOperator.doSiteSearch(ctx, domainName, 5); + if (!sampleResults.isEmpty()) { + url = sampleResults.getFirst().url.withPathAndParam("/", null).toString(); } FeedItems feedItems = null; @@ -165,7 +164,8 @@ public class SearchSiteInfoService { domainInfo, similarSet, linkingDomains, - feedItems + feedItems, + sampleResults ); } @@ -220,7 +220,8 @@ public class SearchSiteInfoService { DomainInformation domainInformation, List similar, List linking, - FeedItems feed + FeedItems feed, + List samples ) { public SiteInfoWithContext(String domain, long domainId, @@ -228,7 +229,8 @@ public class SearchSiteInfoService { DomainInformation domainInformation, List similar, List linking, - FeedItems feedInfo + FeedItems feedInfo, + List samples ) { this(Map.of("info", true), @@ -239,7 +241,8 @@ public class SearchSiteInfoService { domainInformation, similar, linking, - feedInfo); + feedInfo, + samples); } public String getLayout() { diff --git 
a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-feed.hdb b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-feed.hdb index f51a7444..f458e380 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-feed.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-feed.hdb @@ -4,7 +4,17 @@
{{#each items}}
{{title}}
-
{{pubDay}}
{{{description}}}
+
{{pubDay}}
{{{descriptionSafe}}}
{{/each}}
-{{/with}} \ No newline at end of file +{{/with}} + +{{#unless feed}}{{#if samples}} +

Sample

+
+{{#each samples}} +
{{title}}
+
{{{description}}}
+{{/each}} +
+{{/if}}{{/unless}} \ No newline at end of file From c7af40c368a454d213ed031a1212e76b95b9302c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Dec 2023 13:16:10 +0100 Subject: [PATCH 3/3] (search) Change layout balance when feeds/samples are present --- .../nu/marginalia/search/svc/SearchSiteInfoService.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java index c9cb4ec7..290bef50 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java @@ -250,6 +250,12 @@ public class SearchSiteInfoService { if (similar.size() < 25) { return "lopsided"; } + else if (!feed.items().isEmpty()) { + return "lopsided"; + } + else if (!samples.isEmpty()) { + return "lopsided"; + } else { return "balanced"; }