From fa145f632ba40958de4c8e2b0458848e528e46a4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 2 Feb 2024 21:22:07 +0100 Subject: [PATCH] (sideload) Add special handling for sideloaded wiki documents This update enhances the SideloaderProcessing and DocumentClass modules to specially handle sideloaded wiki documents. Wiki content is generally truncated to the first paragraph, which tends to be too short to be included independently. An additional DocumentClass (SIDELOAD) has been introduced to suppress the length check in this case. --- .../converting/processor/DocumentClass.java | 11 +++++++++-- .../converting/sideload/SideloaderProcessing.java | 9 ++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentClass.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentClass.java index ab450a2a..408d3105 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentClass.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentClass.java @@ -6,10 +6,16 @@ package nu.marginalia.converting.processor; public enum DocumentClass { NORMAL, EXTERNALLY_LINKED_ONCE, - EXTERNALLY_LINKED_MULTI; + EXTERNALLY_LINKED_MULTI, + /** A document that is not linked to, but is sideloaded. Ignore most inclusion checks. 
*/ + SIDELOAD; public boolean enforceQualityLimits() { - return this != EXTERNALLY_LINKED_MULTI; + if (this == SIDELOAD) + return false; + if (this == EXTERNALLY_LINKED_MULTI) + return false; + return true; } /** This factor is multiplied onto the length of the document @@ -20,6 +26,7 @@ public enum DocumentClass { case NORMAL -> 1.0; case EXTERNALLY_LINKED_ONCE -> 2.; case EXTERNALLY_LINKED_MULTI -> 10.; + case SIDELOAD -> 25.; }; } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 14b35b6a..f888a6b0 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -56,9 +56,16 @@ public class SideloaderProcessing { null ); + // Give the document processing preferential treatment if this is a sideloaded wiki, since we + // truncate the document to the first paragraph, which typically is too short to be included + // on its own. + final DocumentClass documentClass; + if (type == GeneratorType.WIKI) documentClass = DocumentClass.SIDELOAD; + else documentClass = DocumentClass.NORMAL; + var ret = new ProcessedDocument(); try { - var details = htmlProcessorPlugin.createDetails(crawledDoc, DocumentClass.NORMAL); + var details = htmlProcessorPlugin.createDetails(crawledDoc, documentClass); ret.words = details.words();