From faa50bf5786f31df2fbbe083e842a61d4e36e325 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 1 Jan 2024 16:19:38 +0100 Subject: [PATCH] (sideload) Just index based on first paragraph This seems like it would make the Wikipedia search results worse, but it drastically improves the result quality! This is because Wikipedia has a lot of articles that each talk about a lot of irrelevant concepts, and indexing the entire document means tangentially relevant results tend to displace the most relevant results. --- .../encyclopedia/EncyclopediaMarginaliaNuSideloader.java | 1 + 1 file changed, 1 insertion(+) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index f0686b4c..961e1c79 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -120,6 +120,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC fullHtml.append("

"); fullHtml.append(part); fullHtml.append("

"); + break; // Only take the first part, this improves accuracy a lot } fullHtml.append("");