Yet more restructuring. Improved search result ranking.

This commit is contained in:
Viktor Lofgren 2023-03-16 21:35:54 +01:00
parent 5ef17a2a20
commit 449471a076
471 changed files with 19834 additions and 1088 deletions

View File

@@ -28,7 +28,8 @@ dependencies {
implementation libs.guice
implementation libs.rxjava
implementation libs.protobuf
implementation libs.gson
implementation libs.bundles.gson
implementation libs.fastutil
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit

View File

@@ -30,9 +30,9 @@ public class IndexClient extends AbstractDynamicClient {
}
@CheckReturnValue
public List<SearchResultItem> query(Context ctx, SearchSpecification specs) {
public SearchResultSet query(Context ctx, SearchSpecification specs) {
return wmsa_search_index_api_time.time(
() -> this.postGet(ctx, "/search/", specs, SearchResultSet.class).blockingFirst().getResults()
() -> this.postGet(ctx, "/search/", specs, SearchResultSet.class).blockingFirst()
);
}
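Since query() now returns the whole SearchResultSet instead of the bare result list, a call site would look roughly like this (a sketch; the surrounding service code is not part of this diff):

SearchResultSet set = indexClient.query(ctx, specs);
List<SearchResultItem> items = set.getResults();                 // Lombok @Getter
SearchResultRankingContext rankingCtx = set.getRankingContext(); // new field, see below

The practical effect is that the ranking context introduced in this commit travels with the results rather than being discarded at the client boundary.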

View File

@@ -15,14 +15,14 @@ public class SearchResultItem {
public final long combinedId;
/** How did the subqueries match against the document? */
public final List<SearchResultKeywordScore> scores;
public final List<SearchResultKeywordScore> keywordScores;
/** How many other potential results existed in the same domain */
public int resultsFromDomain;
public SearchResultItem(long val) {
this.combinedId = val;
this.scores = new ArrayList<>(16);
this.keywordScores = new ArrayList<>(16);
}
public EdgeId<EdgeUrl> getUrlId() {
@@ -37,11 +37,11 @@ public class SearchResultItem {
}
/* Used for evaluation */
private transient double scoreValue = 1;
public void setScore(double score) {
private transient SearchResultPreliminaryScore scoreValue = null;
public void setScore(SearchResultPreliminaryScore score) {
scoreValue = score;
}
public double getScore() {
public SearchResultPreliminaryScore getScore() {
return scoreValue;
}

View File

@@ -26,68 +26,17 @@ public final class SearchResultKeywordScore {
this.hasPriorityTerms = hasPriorityTerms;
}
private boolean hasTermFlag(WordFlags flag) {
public boolean hasTermFlag(WordFlags flag) {
return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
}
public double documentValue() {
long sum = 0;
sum += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.;
sum += DocumentMetadata.decodeTopology(encodedDocMetadata);
if (DocumentMetadata.hasFlags(encodedDocMetadata, DocumentFlags.Simple.asBit())) {
sum += 20;
public int positionCount() {
return Integer.bitCount(positions());
}
int rank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13;
if (rank < 0)
sum += rank / 2;
else
sum += rank / 4;
return sum;
public int tfIdf() {
return (int) WordMetadata.decodeTfidf(encodedWordMetadata);
}
public double termValue() {
double sum = 0;
double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
int positionBits = WordMetadata.decodePositions(encodedWordMetadata);
if (hasTermFlag(WordFlags.Title)) {
sum -= 15;
}
if (hasTermFlag(WordFlags.Site) && positionBits != 0) {
sum -= 10;
} else if (hasTermFlag(WordFlags.SiteAdjacent) && positionBits != 0) {
sum -= 5;
}
if (hasTermFlag(WordFlags.Subjects)) {
sum -= 10;
}
if (hasTermFlag(WordFlags.NamesWords)) {
sum -= 1;
}
if (hasTermFlag(WordFlags.UrlDomain)) {
sum -= 5;
}
if (hasTermFlag(WordFlags.UrlPath)) {
sum -= 5;
}
sum -= tfIdf / 10.;
sum -= Integer.bitCount(positionBits) / 3.;
return sum;
}
public int subquery() {
return subquery;
}
@@ -138,8 +87,8 @@ public final class SearchResultKeywordScore {
return "SearchResultKeywordScore[" +
"set=" + subquery + ", " +
"keyword=" + keyword + ", " +
"encodedWordMetadata=" + encodedWordMetadata + ", " +
"encodedDocMetadata=" + encodedDocMetadata + ", " +
"encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " +
"encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ", " +
"hasPriorityTerms=" + hasPriorityTerms + ']';
}
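The class thus stops computing ranking values itself (the documentValue/termValue heuristics above are deleted) and instead exposes decoded metadata. A hypothetical consumer, using only the accessors shown in this diff:

for (SearchResultKeywordScore score : item.keywordScores) {
    boolean inTitle = score.hasTermFlag(WordFlags.Title); // now public
    int hits = score.positionCount();  // bit count of the position mask
    int weight = score.tfIdf();        // decoded from encodedWordMetadata
}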

View File

@@ -0,0 +1,42 @@
package nu.marginalia.index.client.model.results;
import org.jetbrains.annotations.NotNull;
import static java.lang.Boolean.compare;
import static java.lang.Integer.compare;
public record SearchResultPreliminaryScore(boolean hasSingleTermMatch,
boolean hasPriorityTerm,
int minNumberOfFlagsSet,
int minNumberOfPositions,
int overlappingPositions)
implements Comparable<SearchResultPreliminaryScore>
{
@Override
public int compareTo(@NotNull SearchResultPreliminaryScore other) {
int diff;
diff = compare(hasSingleTermMatch, other.hasSingleTermMatch);
if (diff != 0) return diff;
diff = compare(minNumberOfFlagsSet, other.minNumberOfFlagsSet);
if (diff != 0) return diff;
diff = compare(hasPriorityTerm, other.hasPriorityTerm);
if (diff != 0) return diff;
diff = compare(overlappingPositions, other.overlappingPositions);
if (diff != 0) return diff;
return compare(minNumberOfPositions, other.minNumberOfPositions);
}
public boolean isGreat() {
return hasSingleTermMatch || (minNumberOfFlagsSet >= 1 && overlappingPositions >= 1);
}
public boolean isEmpty() {
return minNumberOfFlagsSet == 0
&& minNumberOfPositions == 0
&& overlappingPositions == 0;
}
}
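A small usage sketch, not from the commit, to make the comparison chain concrete; it assumes callers treat a greater score as better, which is consistent with isGreat() rewarding single-term matches and flagged, overlapping matches:

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

class ScoreOrderingDemo {
    public static void main(String[] args) {
        // (hasSingleTermMatch, hasPriorityTerm, minFlags, minPositions, overlapping)
        var scores = new ArrayList<>(List.of(
                new SearchResultPreliminaryScore(false, false, 1, 2, 1),
                new SearchResultPreliminaryScore(true, false, 0, 1, 0),
                new SearchResultPreliminaryScore(false, false, 0, 0, 0)));

        scores.removeIf(SearchResultPreliminaryScore::isEmpty); // drops the all-zero score
        scores.sort(Comparator.reverseOrder());                 // strongest match first

        System.out.println(scores.get(0).hasSingleTermMatch()); // prints true
    }
}

Note that the booleans dominate the ordering: any result with a single-term match outranks every result without one, regardless of the integer fields.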

View File

@@ -0,0 +1,25 @@
package nu.marginalia.index.client.model.results;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import lombok.ToString;
import java.util.Map;
@ToString
public class SearchResultRankingContext {
private final int docCount;
private final Object2IntOpenHashMap<String> termCounts = new Object2IntOpenHashMap<>(10, 0.5f);
public SearchResultRankingContext(int docCount, Map<String, Integer> termCounts) {
this.docCount = docCount;
this.termCounts.putAll(termCounts);
}
public int termFreqDocCount() {
return docCount;
}
public int frequency(String keyword) {
return termCounts.getOrDefault(keyword, 1);
}
}
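The context is presumably consumed by the ranking code, which is not shown in this excerpt. Purely as an illustration of what the two accessors are for, an idf-style term weight could be derived from it like so (a standard BM25-style idf, not something this commit specifies):

// Illustrative only; not the commit's ranking code.
static double idf(SearchResultRankingContext ctx, String keyword) {
    int n = ctx.termFreqDocCount(); // documents in the term frequency dictionary
    int f = ctx.frequency(keyword); // defaults to 1 for unseen keywords
    return Math.log(1 + (n - f + 0.5) / (f + 0.5));
}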

View File

@@ -9,7 +9,7 @@ import java.util.List;
@AllArgsConstructor @Getter @ToString
public class SearchResultSet {
public List<SearchResultItem> results;
public SearchResultRankingContext rankingContext;
public int size() {
return results.size();
}

View File

@@ -0,0 +1,32 @@
plugins {
id 'java'
id "io.freefair.lombok" version "5.3.3.3"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
dependencies {
implementation libs.notnull
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
testImplementation libs.bundles.slf4j.test
implementation libs.guava
implementation libs.guice
implementation libs.commons.lang3
implementation libs.snakeyaml
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}

View File

@@ -0,0 +1,4 @@
# Process
Basic functionality for a Process. Processes must include this dependency to ensure
their loggers are configured properly!

View File

@@ -1,4 +1,4 @@
package nu.marginalia.work_log;
package nu.marginalia.process.log;
import com.google.errorprone.annotations.MustBeClosed;
import org.apache.logging.log4j.util.Strings;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.work_log;
package nu.marginalia.process.log;
public record WorkLogEntry(String id, String ts, String path, int cnt) {
}

View File

@@ -0,0 +1,9 @@
log4j2.isThreadContextMapInheritable=true
status = info
appender.console.type = Console
appender.console.name = LogToConsole
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg{nolookups}%n
appender.console.filter.http.type = MarkerFilter
rootLogger.level = info
rootLogger.appenderRef.console.ref = LogToConsole

View File

@@ -6,7 +6,7 @@ functions based on [POS tags](https://www.ling.upenn.edu/courses/Fall_2003/ling0
## Central Classes
* [DocumentKeywordExtractor](src/main/java/nu/marginalia/keyword_extraction/DocumentKeywordExtractor.java)
* [DocumentKeywordExtractor](src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java)
## See Also

View File

@@ -1,7 +1,7 @@
package nu.marginalia.keyword_extraction;
package nu.marginalia.keyword;
import nu.marginalia.keyword_extraction.extractors.*;
import nu.marginalia.keyword_extraction.model.DocumentKeywordsBuilder;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.model.DocumentLanguageData;
@@ -73,6 +73,8 @@ public class DocumentKeywordExtractor {
}
}
private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
DocumentLanguageData documentLanguageData)
@@ -88,7 +90,7 @@ public class DocumentKeywordExtractor {
}
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
if (matchesWordPattern(w)) {
wordsBuilder.add(w, metadata.getMetadataForWord(word.stemmed()));
}
}
@@ -101,4 +103,43 @@
}
}
}
boolean matchesWordPattern(String s) {
// this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
String wordPartSeparator = ".-_/:+*";
int i = 0;
for (int run = 0; run < 15 && i < s.length(); run++, i++) {
char c = s.charAt(i);
if (c >= 'a' && c <= 'z') continue;
if (c >= 'A' && c <= 'Z') continue;
if (c >= '0' && c <= '9') continue;
break;
}
if (i == 0)
return false;
for (int j = 0; j < 5; j++) {
if (i == s.length()) return true;
if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
return false;
}
i++;
for (int run = 0; run < 10 && i < s.length(); run++, i++) {
char c = s.charAt(i);
if (c >= 'a' && c <= 'z') continue;
if (c >= 'A' && c <= 'Z') continue;
if (c >= '0' && c <= '9') continue;
break;
}
}
return false;
}
}
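For reference, a regex cross-check of the unrolled loop (a sketch, not part of the commit). The loop is in fact slightly more permissive than the regexp quoted in its comment: it tolerates an empty run after a separator, which the "c++" test case below relies on, so the equivalent pattern appears to use {0,10} rather than {1,10}:

import java.util.regex.Pattern;

// Hypothetical slow-path equivalent of matchesWordPattern, for testing.
static final Pattern WORD_PATTERN =
        Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:+*][\\da-zA-Z]{0,10}){0,4}");

static boolean matchesWordPatternSlow(String s) {
    return WORD_PATTERN.matcher(s).matches();
}

Unrolling avoids compiling and backtracking a regex for every candidate keyword, which matters at the volume the converter processes.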

View File

@@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction;
package nu.marginalia.keyword;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentSentence;

View File

@@ -1,7 +1,7 @@
package nu.marginalia.keyword_extraction;
package nu.marginalia.keyword;
import lombok.Builder;
import nu.marginalia.keyword_extraction.extractors.*;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.idx.WordFlags;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction;
package nu.marginalia.keyword;
import nu.marginalia.language.model.WordRep;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction.extractors;
package nu.marginalia.keyword.extractors;
import nu.marginalia.language.model.DocumentLanguageData;

View File

@@ -1,8 +1,8 @@
package nu.marginalia.keyword_extraction.extractors;
package nu.marginalia.keyword.extractors;
import com.google.inject.Inject;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
/** Generates a position bitmask for each word in a document */

View File

@@ -1,13 +1,13 @@
package nu.marginalia.keyword_extraction.extractors;
package nu.marginalia.keyword.extractors;
import com.google.common.base.CharMatcher;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword_extraction.WordReps;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.keyword.KeywordExtractor;
import java.util.*;
import java.util.stream.Collectors;

View File

@@ -1,12 +1,12 @@
package nu.marginalia.keyword_extraction.extractors;
package nu.marginalia.keyword.extractors;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword_extraction.WordReps;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.model.WordSeparator;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import org.apache.commons.lang3.StringUtils;
import java.util.*;

View File

@@ -1,7 +1,7 @@
package nu.marginalia.keyword_extraction.extractors;
package nu.marginalia.keyword.extractors;
import nu.marginalia.keyword_extraction.WordReps;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction.extractors;
package nu.marginalia.keyword.extractors;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.model.EdgeDomain;

View File

@@ -1,12 +1,12 @@
package nu.marginalia.keyword_extraction.extractors;
package nu.marginalia.keyword.extractors;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword_extraction.WordReps;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction.model;
package nu.marginalia.keyword.model;
import nu.marginalia.model.idx.WordMetadata;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction.model;
package nu.marginalia.keyword.model;
import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import lombok.Getter;

View File

@@ -0,0 +1,24 @@
package nu.marginalia.keyword;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
class DocumentKeywordExtractorTest {
@Test
public void testWordPattern() {
DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(null);
Assertions.assertTrue(extractor.matchesWordPattern("test"));
Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));
Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test"));
Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test"));
Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24"));
Assertions.assertTrue(extractor.matchesWordPattern("std::vector"));
Assertions.assertTrue(extractor.matchesWordPattern("c++"));
Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h"));
Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
}
}

View File

@@ -1,8 +1,7 @@
package nu.marginalia.keyword_extraction;
package nu.marginalia.keyword;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.sentence.SentenceExtractor;
@@ -106,10 +105,6 @@ class SentenceExtractorTest {
}
@Test
public void testPattern() {
System.out.println(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches());
}
@SneakyThrows
@Test

View File

@@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction.extractors;
package nu.marginalia.keyword.extractors;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;

View File

@@ -1,7 +1,7 @@
package nu.marginalia.keyword_extraction.extractors;
package nu.marginalia.keyword.extractors;
import com.google.common.collect.Sets;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;
import org.junit.jupiter.api.Test;

View File

@@ -1,7 +1,7 @@
package nu.marginalia.keyword_extraction.extractors;
package nu.marginalia.keyword.extractors;
import com.google.common.collect.Sets;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.test.util.TestLanguageModels;

View File

@@ -1,7 +1,7 @@
package nu.marginalia.keyword_extraction.extractors;
package nu.marginalia.keyword.extractors;
import com.google.common.collect.Sets;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;
import org.jsoup.Jsoup;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction.extractors;
package nu.marginalia.keyword.extractors;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.model.EdgeUrl;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.converting.logic.pubdate;
package nu.marginalia.pubdate;
import nu.marginalia.model.crawl.PubDate;
import org.junit.jupiter.api.Test;

View File

@@ -9,3 +9,4 @@
* [adblock](adblock/) - Simulates Adblock
* [pubdate](pubdate/) - Determines when a document was published
* [topic-detection](topic-detection/) - Tries to identify the topic of a website
* [summary-extraction](summary-extraction/) - Extracts a descriptive passage of text that summarizes a document

View File

@@ -1,7 +1,7 @@
plugins {
id 'java'
id "io.freefair.lombok" version "5.3.3.3"
id 'application'
id 'jvm-test-suite'
}
@@ -11,26 +11,28 @@ java {
}
}
application {
mainClass = 'nu.marginalia.converting.ConverterMain'
applicationName = 'converter-process'
}
tasks.distZip.enabled = false
dependencies {
implementation libs.notnull
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.gson
implementation libs.rxjava
implementation libs.bundles.slf4j
testImplementation libs.bundles.slf4j.test
implementation libs.guava
implementation libs.guice
implementation libs.notnull
implementation libs.snakeyaml
implementation libs.jsoup
implementation libs.zstd
implementation libs.commons.net
implementation libs.opencsv
implementation libs.guice
implementation libs.guava
implementation libs.bundles.gson
implementation libs.trove
implementation libs.fastutil
implementation libs.commons.lang3
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
@@ -38,6 +40,7 @@ dependencies {
}
test {
maxHeapSize = "8G"
useJUnitPlatform()
}

View File

@@ -0,0 +1,17 @@
# Summary Extraction
This feature attempts to find a descriptive passage of text that summarizes
what a search result "is about". It's the text you see below a search result.
It uses several naive heuristics to try to find something that makes sense,
and there is probably room for improvement.
There are many good techniques for doing this, but they've sadly not proved
particularly fast. Whatever solution is used needs to be able to summarize on
the order of 100,000,000 documents within a time budget of a couple of hours.
## Central Classes
* [SummaryExtractor](src/main/java/nu/marginalia/summary/SummaryExtractor.java)
* [SummaryExtractionFilter](src/main/java/nu/marginalia/summary/SummaryExtractionFilter.java) - DOM pruning algo.
Doesn't always work, but when it works it's pretty good.
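For orientation, a hypothetical caller; extractSummary(Document) is the entry point this commit moves into the new package, and instances are normally constructed through Guice with a @Named max-length setting:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

static String summarize(SummaryExtractor extractor, String html) {
    Document doc = Jsoup.parse(html);
    return extractor.extractSummary(doc); // whitespace-normalized and abbreviated internally
}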

View File

@@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic.summary;
package nu.marginalia.summary;
import com.google.common.base.Strings;
import org.apache.commons.lang3.StringUtils;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic.summary;
package nu.marginalia.summary;
import com.google.inject.Inject;
import com.google.inject.name.Named;
@@ -19,9 +19,8 @@ public class SummaryExtractor {
}
public String extractSummary(Document parsed) {
String summaryString;
String summaryString = extractSummaryRaw(parsed);
summaryString = extractSummaryRaw(parsed);
summaryString = truncatedCharacters.matcher(summaryString).replaceAll(" ");
summaryString = StringUtils.abbreviate(summaryString, "", maxSummaryLength);
@@ -81,7 +80,7 @@ public class SummaryExtractor {
}
if (content.length() > 32) {
// AAAA AAAA AAAA AAAA AAAA AAAA AAAA AAAA
// AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHH
return content.toString();
}

View File

@@ -1,17 +1,13 @@
package nu.marginalia.converting.logic;
package nu.marginalia.summary;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.summary.SummaryExtractionFilter;
import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
import nu.marginalia.summary.SummaryExtractionFilter;
import nu.marginalia.summary.SummaryExtractor;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Comparator;
import java.util.HashMap;
@@ -43,47 +39,6 @@ class SummaryExtractorTest {
System.out.println(e.getValue().text());
});
}
@Test
public void testSummaryFilter3() throws IOException {
var data = WmsaHome.getHomePath().resolve("test-data/url-327999153");
String html = Files.readString(data);
var doc = Jsoup.parse(html);
var filter = new SummaryExtractionFilter();
doc.filter(filter);
filter.getSummary(255);
}
@Test
public void testSummaryFilter2() throws IOException {
var data = WmsaHome.getHomePath().resolve("test-data/");
System.out.println("Running");
var fos = new PrintWriter(new FileOutputStream("/tmp/summaryDiff.html"));
fos.println("<table>");
for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
var doc = Jsoup.parse(Files.readString(file.toPath()));
fos.println("<tr><th colspan=2>" + file.getName() + "</th></tr>");
fos.println("<tr><td width=50%>");
var filter = new SummaryExtractionFilter();
doc.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove();
doc.filter(filter);
var ret = filter.getSummary(255);
fos.println(ret);
fos.println("</td><td width=50%>");
String summary = summaryExtractor.extractSummary(Jsoup.parse(Files.readString(file.toPath())));
fos.println(summary);
fos.println("</td></tr>");
}
fos.println("</table>");
fos.flush();
}
@Test
void extractSurrey() throws IOException {

Some files were not shown because too many files have changed in this diff.