Yet more restructuring. Improved search result ranking.

This commit is contained in:
Viktor Lofgren 2023-03-16 21:35:54 +01:00
parent 5ef17a2a20
commit 449471a076
471 changed files with 19834 additions and 1088 deletions

View File

@ -28,7 +28,8 @@ dependencies {
implementation libs.guice implementation libs.guice
implementation libs.rxjava implementation libs.rxjava
implementation libs.protobuf implementation libs.protobuf
implementation libs.gson implementation libs.bundles.gson
implementation libs.fastutil
testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit testImplementation libs.bundles.junit

View File

@ -30,9 +30,9 @@ public class IndexClient extends AbstractDynamicClient {
} }
@CheckReturnValue @CheckReturnValue
public List<SearchResultItem> query(Context ctx, SearchSpecification specs) { public SearchResultSet query(Context ctx, SearchSpecification specs) {
return wmsa_search_index_api_time.time( return wmsa_search_index_api_time.time(
() -> this.postGet(ctx, "/search/", specs, SearchResultSet.class).blockingFirst().getResults() () -> this.postGet(ctx, "/search/", specs, SearchResultSet.class).blockingFirst()
); );
} }

View File

@ -15,14 +15,14 @@ public class SearchResultItem {
public final long combinedId; public final long combinedId;
/** How did the subqueries match against the document ? */ /** How did the subqueries match against the document ? */
public final List<SearchResultKeywordScore> scores; public final List<SearchResultKeywordScore> keywordScores;
/** How many other potential results existed in the same domain */ /** How many other potential results existed in the same domain */
public int resultsFromDomain; public int resultsFromDomain;
public SearchResultItem(long val) { public SearchResultItem(long val) {
this.combinedId = val; this.combinedId = val;
this.scores = new ArrayList<>(16); this.keywordScores = new ArrayList<>(16);
} }
public EdgeId<EdgeUrl> getUrlId() { public EdgeId<EdgeUrl> getUrlId() {
@ -37,11 +37,11 @@ public class SearchResultItem {
} }
/* Used for evaluation */ /* Used for evaluation */
private transient double scoreValue = 1; private transient SearchResultPreliminaryScore scoreValue = null;
public void setScore(double score) { public void setScore(SearchResultPreliminaryScore score) {
scoreValue = score; scoreValue = score;
} }
public double getScore() { public SearchResultPreliminaryScore getScore() {
return scoreValue; return scoreValue;
} }

View File

@ -26,68 +26,17 @@ public final class SearchResultKeywordScore {
this.hasPriorityTerms = hasPriorityTerms; this.hasPriorityTerms = hasPriorityTerms;
} }
private boolean hasTermFlag(WordFlags flag) { public boolean hasTermFlag(WordFlags flag) {
return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit()); return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
} }
public double documentValue() { public int positionCount() {
long sum = 0; return Integer.bitCount(positions());
sum += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.;
sum += DocumentMetadata.decodeTopology(encodedDocMetadata);
if (DocumentMetadata.hasFlags(encodedDocMetadata, DocumentFlags.Simple.asBit())) {
sum += 20;
} }
int rank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13; public int tfIdf() {
if (rank < 0) return (int) WordMetadata.decodeTfidf(encodedWordMetadata);
sum += rank / 2;
else
sum += rank / 4;
return sum;
} }
public double termValue() {
double sum = 0;
double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
int positionBits = WordMetadata.decodePositions(encodedWordMetadata);
if (hasTermFlag(WordFlags.Title)) {
sum -= 15;
}
if (hasTermFlag(WordFlags.Site) && positionBits != 0) {
sum -= 10;
} else if (hasTermFlag(WordFlags.SiteAdjacent) && positionBits != 0) {
sum -= 5;
}
if (hasTermFlag(WordFlags.Subjects)) {
sum -= 10;
}
if (hasTermFlag(WordFlags.NamesWords)) {
sum -= 1;
}
if (hasTermFlag(WordFlags.UrlDomain)) {
sum -= 5;
}
if (hasTermFlag(WordFlags.UrlPath)) {
sum -= 5;
}
sum -= tfIdf / 10.;
sum -= Integer.bitCount(positionBits) / 3.;
return sum;
}
public int subquery() { public int subquery() {
return subquery; return subquery;
} }
@ -138,8 +87,8 @@ public final class SearchResultKeywordScore {
return "SearchResultKeywordScore[" + return "SearchResultKeywordScore[" +
"set=" + subquery + ", " + "set=" + subquery + ", " +
"keyword=" + keyword + ", " + "keyword=" + keyword + ", " +
"encodedWordMetadata=" + encodedWordMetadata + ", " + "encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " +
"encodedDocMetadata=" + encodedDocMetadata + ", " + "encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ", " +
"hasPriorityTerms=" + hasPriorityTerms + ']'; "hasPriorityTerms=" + hasPriorityTerms + ']';
} }

View File

@ -0,0 +1,42 @@
package nu.marginalia.index.client.model.results;
import org.jetbrains.annotations.NotNull;
import static java.lang.Boolean.compare;
import static java.lang.Integer.compare;
/**
 * Preliminary relevance score for a single search result, built from keyword
 * match statistics and compared field-by-field during ranking.
 */
public record SearchResultPreliminaryScore(boolean hasSingleTermMatch,
boolean hasPriorityTerm,
int minNumberOfFlagsSet,
int minNumberOfPositions,
int overlappingPositions)
implements Comparable<SearchResultPreliminaryScore>
{
/**
 * Lexicographic comparison: single-term match first, then minimum flag count,
 * priority-term presence, overlapping positions, and finally minimum position count.
 * Higher field values compare as greater.
 * NOTE(review): whether "greater" means a better or worse rank depends on the
 * sort direction used by the consumer -- confirm against the ranking code.
 */
@Override
public int compareTo(@NotNull SearchResultPreliminaryScore other) {
int diff;
diff = compare(hasSingleTermMatch, other.hasSingleTermMatch);
if (diff != 0) return diff;
diff = compare(minNumberOfFlagsSet, other.minNumberOfFlagsSet);
if (diff != 0) return diff;
diff = compare(hasPriorityTerm, other.hasPriorityTerm);
if (diff != 0) return diff;
diff = compare(overlappingPositions, other.overlappingPositions);
if (diff != 0) return diff;
return compare(minNumberOfPositions, other.minNumberOfPositions);
}
/** True for a strong match: a single-term hit, or at least one flag set
 * combined with at least one overlapping position. */
public boolean isGreat() {
return hasSingleTermMatch || (minNumberOfFlagsSet >= 1 && overlappingPositions >= 1);
}
/** True when no flags, positions, or overlaps were recorded at all. */
public boolean isEmpty() {
return minNumberOfFlagsSet == 0
&& minNumberOfPositions == 0
&& overlappingPositions == 0;
}
}

View File

@ -0,0 +1,25 @@
package nu.marginalia.index.client.model.results;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import lombok.ToString;
import java.util.Map;
/**
 * Corpus-level statistics carried alongside a result set: the total document
 * count and per-term document frequencies.
 */
@ToString
public class SearchResultRankingContext {
// Total number of documents backing the term-frequency statistics.
private final int docCount;
// term -> number of documents containing it; populated once at construction.
private final Object2IntOpenHashMap<String> termCounts = new Object2IntOpenHashMap<>(10, 0.5f);
public SearchResultRankingContext(int docCount, Map<String, Integer> termCounts) {
this.docCount = docCount;
this.termCounts.putAll(termCounts);
}
/** @return the total document count used for frequency normalization */
public int termFreqDocCount() {
return docCount;
}
/** @return the number of documents containing the keyword; unknown terms
 * default to 1 (presumably to avoid division by zero in tf-idf style
 * calculations -- NOTE(review): confirm rationale). */
public int frequency(String keyword) {
return termCounts.getOrDefault(keyword, 1);
}
}

View File

@ -9,7 +9,7 @@ import java.util.List;
@AllArgsConstructor @Getter @ToString @AllArgsConstructor @Getter @ToString
public class SearchResultSet { public class SearchResultSet {
public List<SearchResultItem> results; public List<SearchResultItem> results;
public SearchResultRankingContext rankingContext;
public int size() { public int size() {
return results.size(); return results.size();
} }

View File

@ -0,0 +1,32 @@
plugins {
    id 'java'
    id "io.freefair.lombok" version "5.3.3.3"
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(17))
    }
}

dependencies {
    implementation libs.notnull

    implementation libs.lombok
    annotationProcessor libs.lombok
    implementation libs.bundles.slf4j

    implementation libs.guava
    implementation libs.guice
    implementation libs.commons.lang3
    implementation libs.snakeyaml

    // declared once; the original listed this dependency twice
    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}

View File

@ -0,0 +1,4 @@
# Process
Basic functionality for a Process. Processes must include this dependency to ensure
their loggers are configured properly!

View File

@ -1,4 +1,4 @@
package nu.marginalia.work_log; package nu.marginalia.process.log;
import com.google.errorprone.annotations.MustBeClosed; import com.google.errorprone.annotations.MustBeClosed;
import org.apache.logging.log4j.util.Strings; import org.apache.logging.log4j.util.Strings;

View File

@ -1,4 +1,4 @@
package nu.marginalia.work_log; package nu.marginalia.process.log;
public record WorkLogEntry(String id, String ts, String path, int cnt) { public record WorkLogEntry(String id, String ts, String path, int cnt) {
} }

View File

@ -0,0 +1,9 @@
# Make the Log4j2 thread-context map inheritable by child threads.
log4j2.isThreadContextMapInheritable=true
status = info
# Single console appender with colorized log-level output.
appender.console.type = Console
appender.console.name = LogToConsole
appender.console.layout.type = PatternLayout
# %msg{nolookups} disables property lookups inside log messages.
appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg{nolookups}%n
# NOTE(review): MarkerFilter is declared with only a type -- no 'marker'
# attribute or onMatch/onMismatch settings; confirm this is intentional.
appender.console.filter.http.type = MarkerFilter
rootLogger.level = info
rootLogger.appenderRef.console.ref = LogToConsole

View File

@ -6,7 +6,7 @@ functions based on [POS tags](https://www.ling.upenn.edu/courses/Fall_2003/ling0
## Central Classes ## Central Classes
* [DocumentKeywordExtractor](src/main/java/nu/marginalia/keyword_extraction/DocumentKeywordExtractor.java) * [DocumentKeywordExtractor](src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java)
## See Also ## See Also

View File

@ -1,7 +1,7 @@
package nu.marginalia.keyword_extraction; package nu.marginalia.keyword;
import nu.marginalia.keyword_extraction.extractors.*; import nu.marginalia.keyword.extractors.*;
import nu.marginalia.keyword_extraction.model.DocumentKeywordsBuilder; import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.WordPatterns; import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener; import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;
@ -73,6 +73,8 @@ public class DocumentKeywordExtractor {
} }
} }
private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder, private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata, KeywordMetadata metadata,
DocumentLanguageData documentLanguageData) DocumentLanguageData documentLanguageData)
@ -88,7 +90,7 @@ public class DocumentKeywordExtractor {
} }
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase()); String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) { if (matchesWordPattern(w)) {
wordsBuilder.add(w, metadata.getMetadataForWord(word.stemmed())); wordsBuilder.add(w, metadata.getMetadataForWord(word.stemmed()));
} }
} }
@ -101,4 +103,43 @@ public class DocumentKeywordExtractor {
} }
} }
} }
/**
 * Decides whether a token is an acceptable keyword shape.
 * Hand-unrolled equivalent of the regex
 * [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
 * (unrolled to avoid regex-engine overhead), with one deliberate difference
 * preserved from the original: a part after a separator may be empty.
 */
boolean matchesWordPattern(String s) {
    final String partSeparators = ".-_/:+*";
    final int length = s.length();

    // Leading run of up to 15 ASCII alphanumerics; at least one is required.
    int pos = scanAlphanumericRun(s, 0, 15);
    if (pos == 0)
        return false;

    // Up to four further parts, each a separator followed by up to 10 alphanumerics.
    for (int part = 0; part < 5; part++) {
        if (pos == length)
            return true;
        if (partSeparators.indexOf(s.charAt(pos)) < 0)
            return false;
        pos = scanAlphanumericRun(s, pos + 1, 10);
    }
    return false;
}

/** Advances from {@code start} over at most {@code maxRun} ASCII alphanumerics
 * and returns the index of the first character not consumed. */
private static int scanAlphanumericRun(String s, int start, int maxRun) {
    int i = start;
    final int limit = Math.min(s.length(), start + maxRun);
    while (i < limit && isAsciiAlphanumeric(s.charAt(i))) {
        i++;
    }
    return i;
}

/** True for characters in [0-9a-zA-Z]. */
private static boolean isAsciiAlphanumeric(char c) {
    return (c >= '0' && c <= '9')
        || (c >= 'a' && c <= 'z')
        || (c >= 'A' && c <= 'Z');
}
} }

View File

@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction; package nu.marginalia.keyword;
import nu.marginalia.language.WordPatterns; import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.DocumentSentence;

View File

@ -1,7 +1,7 @@
package nu.marginalia.keyword_extraction; package nu.marginalia.keyword;
import lombok.Builder; import lombok.Builder;
import nu.marginalia.keyword_extraction.extractors.*; import nu.marginalia.keyword.extractors.*;
import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordFlags;

View File

@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction; package nu.marginalia.keyword;
import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordRep;

View File

@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction.extractors; package nu.marginalia.keyword.extractors;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;

View File

@ -1,8 +1,8 @@
package nu.marginalia.keyword_extraction.extractors; package nu.marginalia.keyword.extractors;
import com.google.inject.Inject; import com.google.inject.Inject;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword_extraction.KeywordExtractor; import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;
/** Generates a position bitmask for each word in a document */ /** Generates a position bitmask for each word in a document */

View File

@ -1,13 +1,13 @@
package nu.marginalia.keyword_extraction.extractors; package nu.marginalia.keyword.extractors;
import com.google.common.base.CharMatcher; import com.google.common.base.CharMatcher;
import it.unimi.dsi.fastutil.objects.Object2IntMap; import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword_extraction.WordReps; import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordRep;
import nu.marginalia.keyword_extraction.KeywordExtractor; import nu.marginalia.keyword.KeywordExtractor;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;

View File

@ -1,12 +1,12 @@
package nu.marginalia.keyword_extraction.extractors; package nu.marginalia.keyword.extractors;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword_extraction.WordReps; import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSpan; import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.model.WordSeparator; import nu.marginalia.language.model.WordSeparator;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import java.util.*; import java.util.*;

View File

@ -1,7 +1,7 @@
package nu.marginalia.keyword_extraction.extractors; package nu.marginalia.keyword.extractors;
import nu.marginalia.keyword_extraction.WordReps; import nu.marginalia.keyword.WordReps;
import nu.marginalia.keyword_extraction.KeywordExtractor; import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordRep;

View File

@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction.extractors; package nu.marginalia.keyword.extractors;
import ca.rmen.porterstemmer.PorterStemmer; import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;

View File

@ -1,12 +1,12 @@
package nu.marginalia.keyword_extraction.extractors; package nu.marginalia.keyword.extractors;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword_extraction.WordReps; import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.WordPatterns; import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordRep;
import nu.marginalia.keyword_extraction.KeywordExtractor; import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.model.WordSpan; import nu.marginalia.language.model.WordSpan;
import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;

View File

@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction.model; package nu.marginalia.keyword.model;
import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.model.idx.WordMetadata;

View File

@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction.model; package nu.marginalia.keyword.model;
import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap; import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import lombok.Getter; import lombok.Getter;

View File

@ -0,0 +1,24 @@
package nu.marginalia.keyword;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
/** Tests for {@code DocumentKeywordExtractor.matchesWordPattern}, the
 * hand-unrolled equivalent of [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}. */
class DocumentKeywordExtractorTest {
@Test
public void testWordPattern() {
// null constructor argument is tolerated here since matchesWordPattern
// does not touch instance state -- TODO confirm against the constructor
DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(null);
Assertions.assertTrue(extractor.matchesWordPattern("test"));
// 15 alphanumerics is the maximum length of the leading part
Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));
// at most four separator-joined parts may follow the first
Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test"));
Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test"));
Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24"));
Assertions.assertTrue(extractor.matchesWordPattern("std::vector"));
Assertions.assertTrue(extractor.matchesWordPattern("c++"));
Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h"));
// over 15 characters with no separators is rejected
Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
}
}

View File

@ -1,8 +1,7 @@
package nu.marginalia.keyword_extraction; package nu.marginalia.keyword;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.LanguageModels; import nu.marginalia.LanguageModels;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSpan; import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.SentenceExtractor;
@ -106,10 +105,6 @@ class SentenceExtractorTest {
} }
@Test
public void testPattern() {
System.out.println(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches());
}
@SneakyThrows @SneakyThrows
@Test @Test

View File

@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction.extractors; package nu.marginalia.keyword.extractors;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels; import nu.marginalia.test.util.TestLanguageModels;

View File

@ -1,7 +1,7 @@
package nu.marginalia.keyword_extraction.extractors; package nu.marginalia.keyword.extractors;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import nu.marginalia.keyword_extraction.KeywordExtractor; import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels; import nu.marginalia.test.util.TestLanguageModels;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;

View File

@ -1,7 +1,7 @@
package nu.marginalia.keyword_extraction.extractors; package nu.marginalia.keyword.extractors;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import nu.marginalia.keyword_extraction.KeywordExtractor; import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.test.util.TestLanguageModels; import nu.marginalia.test.util.TestLanguageModels;

View File

@ -1,7 +1,7 @@
package nu.marginalia.keyword_extraction.extractors; package nu.marginalia.keyword.extractors;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import nu.marginalia.keyword_extraction.KeywordExtractor; import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels; import nu.marginalia.test.util.TestLanguageModels;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;

View File

@ -1,4 +1,4 @@
package nu.marginalia.keyword_extraction.extractors; package nu.marginalia.keyword.extractors;
import ca.rmen.porterstemmer.PorterStemmer; import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;

View File

@ -1,4 +1,4 @@
package nu.marginalia.converting.logic.pubdate; package nu.marginalia.pubdate;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;

View File

@ -9,3 +9,4 @@
* [adblock](adblock/) - Simulates Adblock * [adblock](adblock/) - Simulates Adblock
* [pubdate](pubdate/) - Determines when a document was published * [pubdate](pubdate/) - Determines when a document was published
* [topic-detection](topic-detection/) - Tries to identify the topic of a website * [topic-detection](topic-detection/) - Tries to identify the topic of a website
* [summary-extraction](summary-extraction/) - Extracts a descriptive summary text from a document

View File

@ -1,7 +1,7 @@
plugins { plugins {
id 'java' id 'java'
id "io.freefair.lombok" version "5.3.3.3" id "io.freefair.lombok" version "5.3.3.3"
id 'application'
id 'jvm-test-suite' id 'jvm-test-suite'
} }
@ -11,26 +11,28 @@ java {
} }
} }
application {
mainClass = 'nu.marginalia.converting.ConverterMain'
applicationName = 'converter-process'
}
tasks.distZip.enabled = false
dependencies { dependencies {
implementation libs.notnull
implementation libs.lombok implementation libs.lombok
annotationProcessor libs.lombok annotationProcessor libs.lombok
implementation libs.bundles.gson
implementation libs.rxjava
implementation libs.bundles.slf4j implementation libs.bundles.slf4j
testImplementation libs.bundles.slf4j.test
implementation libs.guava implementation libs.notnull
implementation libs.guice
implementation libs.snakeyaml
implementation libs.jsoup implementation libs.jsoup
implementation libs.zstd
implementation libs.commons.net implementation libs.guice
implementation libs.guava
implementation libs.opencsv implementation libs.bundles.gson
implementation libs.trove
implementation libs.fastutil
implementation libs.commons.lang3
testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit testImplementation libs.bundles.junit
@ -38,6 +40,7 @@ dependencies {
} }
test { test {
maxHeapSize = "8G"
useJUnitPlatform() useJUnitPlatform()
} }

View File

@ -0,0 +1,17 @@
# Summary Extraction
This feature attempts to find a descriptive passage of text that summarizes
what a search result "is about". It's the text you see below a search result.
It uses several naive heuristics to try to find something that makes sense,
and there is probably room for improvement.
There are many good techniques for doing this, but they've sadly not proved
particularly fast. Whatever solution is used needs to be able to summarize on
the order of 100,000,000 documents within a time budget of a couple of hours.
## Central Classes
* [SummaryExtractor](src/main/java/nu/marginalia/summary/SummaryExtractor.java)
* [SummaryExtractionFilter](src/main/java/nu/marginalia/summary/SummaryExtractionFilter.java) - DOM pruning algo.
Doesn't always work, but when it works it's pretty good.

View File

@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic.summary; package nu.marginalia.summary;
import com.google.common.base.Strings; import com.google.common.base.Strings;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;

View File

@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic.summary; package nu.marginalia.summary;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.name.Named; import com.google.inject.name.Named;
@ -19,9 +19,8 @@ public class SummaryExtractor {
} }
public String extractSummary(Document parsed) { public String extractSummary(Document parsed) {
String summaryString; String summaryString = extractSummaryRaw(parsed);
summaryString = extractSummaryRaw(parsed);
summaryString = truncatedCharacters.matcher(summaryString).replaceAll(" "); summaryString = truncatedCharacters.matcher(summaryString).replaceAll(" ");
summaryString = StringUtils.abbreviate(summaryString, "", maxSummaryLength); summaryString = StringUtils.abbreviate(summaryString, "", maxSummaryLength);
@ -81,7 +80,7 @@ public class SummaryExtractor {
} }
if (content.length() > 32) { if (content.length() > 32) {
// AAAA AAAA AAAA AAAA AAAA AAAA AAAA AAAA // AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHH
return content.toString(); return content.toString();
} }

View File

@ -1,17 +1,13 @@
package nu.marginalia.converting.logic; package nu.marginalia.summary;
import nu.marginalia.WmsaHome; import nu.marginalia.summary.SummaryExtractionFilter;
import nu.marginalia.converting.processor.logic.summary.SummaryExtractionFilter; import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
@ -43,47 +39,6 @@ class SummaryExtractorTest {
System.out.println(e.getValue().text()); System.out.println(e.getValue().text());
}); });
} }
@Test
public void testSummaryFilter3() throws IOException {
var data = WmsaHome.getHomePath().resolve("test-data/url-327999153");
String html = Files.readString(data);
var doc = Jsoup.parse(html);
var filter = new SummaryExtractionFilter();
doc.filter(filter);
filter.getSummary(255);
}
@Test
public void testSummaryFilter2() throws IOException {
var data = WmsaHome.getHomePath().resolve("test-data/");
System.out.println("Running");
var fos = new PrintWriter(new FileOutputStream("/tmp/summaryDiff.html"));
fos.println("<table>");
for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
var doc = Jsoup.parse(Files.readString(file.toPath()));
fos.println("<tr><th colspan=2>" + file.getName() + "</th></tr>");
fos.println("<tr><td width=50%>");
var filter = new SummaryExtractionFilter();
doc.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove();
doc.filter(filter);
var ret = filter.getSummary(255);
fos.println(ret);
fos.println("</td><td width=50%>");
String summary = summaryExtractor.extractSummary(Jsoup.parse(Files.readString(file.toPath())));
fos.println(summary);
fos.println("</td></tr>");
}
fos.println("</table>");
fos.flush();
}
@Test @Test
void extractSurrey() throws IOException { void extractSurrey() throws IOException {

Some files were not shown because too many files have changed in this diff Show More