mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(sideload) Add special handling for sideloaded wiki documents
This update enhances the SideloaderProcessing and DocumentClass modules to specially handle sideloaded wiki documents. Wiki content is typically truncated to the first paragraph, which tends to be too short to be included on its own. An additional DocumentClass (SIDELOAD) has been introduced to suppress the length check in this case.
This commit is contained in:
parent
785d8deadd
commit
fa145f632b
@ -6,10 +6,16 @@ package nu.marginalia.converting.processor;
|
|||||||
public enum DocumentClass {
|
public enum DocumentClass {
|
||||||
NORMAL,
|
NORMAL,
|
||||||
EXTERNALLY_LINKED_ONCE,
|
EXTERNALLY_LINKED_ONCE,
|
||||||
EXTERNALLY_LINKED_MULTI;
|
EXTERNALLY_LINKED_MULTI,
|
||||||
|
/** A document that is not linked to, but is sideloaded. Ignore most inclusion checks. */
|
||||||
|
SIDELOAD;
|
||||||
|
|
||||||
public boolean enforceQualityLimits() {
|
public boolean enforceQualityLimits() {
|
||||||
return this != EXTERNALLY_LINKED_MULTI;
|
if (this == SIDELOAD)
|
||||||
|
return false;
|
||||||
|
if (this == EXTERNALLY_LINKED_MULTI)
|
||||||
|
return false;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** This factor is multiplied onto the length of the document
|
/** This factor is multiplied onto the length of the document
|
||||||
@ -20,6 +26,7 @@ public enum DocumentClass {
|
|||||||
case NORMAL -> 1.0;
|
case NORMAL -> 1.0;
|
||||||
case EXTERNALLY_LINKED_ONCE -> 2.;
|
case EXTERNALLY_LINKED_ONCE -> 2.;
|
||||||
case EXTERNALLY_LINKED_MULTI -> 10.;
|
case EXTERNALLY_LINKED_MULTI -> 10.;
|
||||||
|
case SIDELOAD -> 25.;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -56,9 +56,16 @@ public class SideloaderProcessing {
|
|||||||
null
|
null
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Give the document processing preferential treatment if this is a sideloaded wiki, since we
|
||||||
|
// truncate the document to the first paragraph, which typically is too short to be included
|
||||||
|
// on its own.
|
||||||
|
final DocumentClass documentClass;
|
||||||
|
if (type == GeneratorType.WIKI) documentClass = DocumentClass.SIDELOAD;
|
||||||
|
else documentClass = DocumentClass.NORMAL;
|
||||||
|
|
||||||
var ret = new ProcessedDocument();
|
var ret = new ProcessedDocument();
|
||||||
try {
|
try {
|
||||||
var details = htmlProcessorPlugin.createDetails(crawledDoc, DocumentClass.NORMAL);
|
var details = htmlProcessorPlugin.createDetails(crawledDoc, documentClass);
|
||||||
|
|
||||||
ret.words = details.words();
|
ret.words = details.words();
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user