mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Better logic for summarization.
This commit is contained in:
parent
67c15a34e6
commit
f0b4acb358
@ -8,7 +8,9 @@ import org.jsoup.nodes.TextNode;
|
|||||||
import org.jsoup.select.NodeFilter;
|
import org.jsoup.select.NodeFilter;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.function.Function;
|
||||||
|
|
||||||
|
import static nu.marginalia.summary.heuristic.HeuristicTextUtil.countOccurrencesOfAnyWord;
|
||||||
import static org.jsoup.internal.StringUtil.isActuallyWhitespace;
|
import static org.jsoup.internal.StringUtil.isActuallyWhitespace;
|
||||||
import static org.jsoup.internal.StringUtil.isInvisibleChar;
|
import static org.jsoup.internal.StringUtil.isInvisibleChar;
|
||||||
|
|
||||||
@ -57,9 +59,9 @@ public class SummarizingDOMFilter implements NodeFilter {
|
|||||||
public String getSummary(int maxLength, Collection<String> importantWords) {
|
public String getSummary(int maxLength, Collection<String> importantWords) {
|
||||||
List<NodeStatistics> ret = new ArrayList<>(statistics.size());
|
List<NodeStatistics> ret = new ArrayList<>(statistics.size());
|
||||||
for (var stats : statistics.values()) {
|
for (var stats : statistics.values()) {
|
||||||
if (stats.textToTagRatio() < 0.85) continue;
|
if (stats.textToTagRatio() < 0.75) continue;
|
||||||
if (!stats.isElement() || !stats.isAppropriateTagType()) continue;
|
if (!stats.isElement() || !stats.isAppropriateTagType()) continue;
|
||||||
if (stats.textLength() < 128) continue;
|
if (stats.textLength() < 64) continue;
|
||||||
if (stats.isLink()) continue;
|
if (stats.isLink()) continue;
|
||||||
|
|
||||||
ret.add(stats);
|
ret.add(stats);
|
||||||
@ -81,38 +83,40 @@ public class SummarizingDOMFilter implements NodeFilter {
|
|||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<NodeStatistics> sortByWordRelevance(List<NodeStatistics> in,
|
|
||||||
Collection<String> words) {
|
|
||||||
|
|
||||||
if (words.isEmpty())
|
// Words we don't want to appear in the summary
|
||||||
|
private static List<String> badWords = List.of("copyright", "rights", "reserved", "post",
|
||||||
|
"posted", "author", "published", "publish", "cookie", "cookies", "©", "terms", "conditions");
|
||||||
|
|
||||||
|
private List<NodeStatistics> sortByWordRelevance(List<NodeStatistics> in,
|
||||||
|
Collection<String> importantWords) {
|
||||||
|
|
||||||
|
if (importantWords.isEmpty())
|
||||||
return in;
|
return in;
|
||||||
|
|
||||||
Map<NodeStatistics, Integer> ret = new HashMap<>(in.size());
|
Map<NodeStatistics, Integer> ret = new HashMap<>(in.size());
|
||||||
int cntTotal = 0;
|
|
||||||
|
|
||||||
// This is a relatively small list at this point
|
// This is a relatively small list at this point
|
||||||
// so this function isn't as bad as it looks
|
// so this function isn't as bad as it looks
|
||||||
|
|
||||||
for (var stats : in) {
|
for (var stats : in) {
|
||||||
var lcText = stats.text().toLowerCase();
|
// text() is expensive, we don't mind sifting through superfluous whitespace
|
||||||
|
int cnt = stats.score(tn ->
|
||||||
int cnt = 0;
|
countOccurrencesOfAnyWord(tn.getWholeText(), importantWords)
|
||||||
for (var word : words) {
|
- countOccurrencesOfAnyWord(tn.getWholeText(), badWords));
|
||||||
if (lcText.contains(word)) {
|
|
||||||
cnt++;
|
|
||||||
cntTotal++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
if (cnt > 0) {
|
||||||
ret.put(stats, -cnt);
|
ret.put(stats, -cnt);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Skip the sorting if we didn't match any words
|
// Skip the sorting if we didn't match any importantWords
|
||||||
if (cntTotal == 0) {
|
if (ret.isEmpty()) {
|
||||||
return in;
|
return in;
|
||||||
}
|
}
|
||||||
|
|
||||||
in.sort(Comparator.comparing(ret::get));
|
in.sort(Comparator.comparing(w -> ret.getOrDefault(w, 0)));
|
||||||
|
|
||||||
return in;
|
return in;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -216,6 +220,27 @@ public class SummarizingDOMFilter implements NodeFilter {
|
|||||||
}
|
}
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
public String wholeText() {
|
||||||
|
if (node instanceof Element e) {
|
||||||
|
return e.wholeText();
|
||||||
|
}
|
||||||
|
else if (node instanceof TextNode tn) {
|
||||||
|
return tn.getWholeText();
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
public int score(Function<TextNode, Integer> fn) {
|
||||||
|
int[] score = new int[1];
|
||||||
|
|
||||||
|
node.traverse((node, depth) -> {
|
||||||
|
if (node instanceof TextNode tn) {
|
||||||
|
score[0] += fn.apply(tn);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return score[0];
|
||||||
|
}
|
||||||
|
|
||||||
public boolean isElement() {
|
public boolean isElement() {
|
||||||
return node instanceof Element;
|
return node instanceof Element;
|
||||||
|
Loading…
Reference in New Issue
Block a user