(sideload) Add special handling for sideloaded wiki documents

This update enhances the SideloaderProcessing and DocumentClass modules to give sideloaded wiki documents special handling. Wiki content is truncated to the first paragraph, which tends to be too short to be included on its own. An additional DocumentClass (SIDELOAD) has been introduced to suppress the length check in this case.
This commit is contained in:
Viktor Lofgren 2024-02-02 21:22:07 +01:00
parent 785d8deadd
commit fa145f632b
2 changed files with 17 additions and 3 deletions

View File

@ -6,10 +6,16 @@ package nu.marginalia.converting.processor;
public enum DocumentClass {
NORMAL,
EXTERNALLY_LINKED_ONCE,
EXTERNALLY_LINKED_MULTI;
EXTERNALLY_LINKED_MULTI,
/** A document that is not linked to, but is sideloaded. Ignore most inclusion checks. */
SIDELOAD;
/** Tells the caller whether quality limits should be enforced for this document class.
 *
 * @return per the if-chain below: false for SIDELOAD and for EXTERNALLY_LINKED_MULTI,
 *         true for every other class
 */
public boolean enforceQualityLimits() {
// NOTE(review): the next line looks like the pre-change version of this method as shown
// by the diff view (the +/- markers are not visible here) -- confirm it is superseded by
// the if-chain that follows, which additionally exempts SIDELOAD.
return this != EXTERNALLY_LINKED_MULTI;
if (this == SIDELOAD)
return false;
if (this == EXTERNALLY_LINKED_MULTI)
return false;
return true;
}
/** This factor is multiplied onto the length of the document
@ -20,6 +26,7 @@ public enum DocumentClass {
case NORMAL -> 1.0;
case EXTERNALLY_LINKED_ONCE -> 2.;
case EXTERNALLY_LINKED_MULTI -> 10.;
case SIDELOAD -> 25.;
};
}
}

View File

@ -56,9 +56,16 @@ public class SideloaderProcessing {
null
);
// Give the document processing preferential treatment if this is a sideloaded wiki, since we
// truncate the document to the first paragraph, which typically is too short to be included
// on its own.
final DocumentClass documentClass;
if (type == GeneratorType.WIKI) documentClass = DocumentClass.SIDELOAD;
else documentClass = DocumentClass.NORMAL;
var ret = new ProcessedDocument();
try {
var details = htmlProcessorPlugin.createDetails(crawledDoc, DocumentClass.NORMAL);
var details = htmlProcessorPlugin.createDetails(crawledDoc, documentClass);
ret.words = details.words();