mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Move list-conversion into getDescription method.
This commit is contained in:
parent
88399e30e2
commit
44b1fe0e6d
@@ -135,9 +135,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
|
|
||||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
|
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
|
||||||
|
|
||||||
ret.description = getDescription(doc,
|
ret.description = getDescription(doc, words.importantWords);
|
||||||
new ArrayList<>(words.importantWords)
|
|
||||||
);
|
|
||||||
|
|
||||||
var tagWords = new MetaTagsBuilder()
|
var tagWords = new MetaTagsBuilder()
|
||||||
.addDomainCrawlData(crawledDomain)
|
.addDomainCrawlData(crawledDomain)
|
||||||
@@ -270,11 +268,20 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
}
|
}
|
||||||
|
|
||||||
private String getDescription(Document doc,
|
private String getDescription(Document doc,
|
||||||
Collection<String> importantWords)
|
Set<String> importantWords)
|
||||||
{
|
{
|
||||||
importantWords.removeIf(w -> w.contains("_"));
|
List<String> cleanedWords = new ArrayList<>(importantWords.size());
|
||||||
|
|
||||||
return summaryExtractor.extractSummary(doc, importantWords);
|
for (var word : importantWords) {
|
||||||
|
// summary extraction is not interested in n-grams
|
||||||
|
if (word.contains("_")) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
cleanedWords.add(word);
|
||||||
|
}
|
||||||
|
|
||||||
|
return summaryExtractor.extractSummary(doc, cleanedWords);
|
||||||
}
|
}
|
||||||
|
|
||||||
private int getLength(Document doc) {
|
private int getLength(Document doc) {
|
||||||
|
Loading…
Reference in New Issue
Block a user