mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 04:58:59 +00:00
Yet more restructuring. Improved search result ranking.
This commit is contained in:
parent
5ef17a2a20
commit
449471a076
@ -28,7 +28,8 @@ dependencies {
|
||||
implementation libs.guice
|
||||
implementation libs.rxjava
|
||||
implementation libs.protobuf
|
||||
implementation libs.gson
|
||||
implementation libs.bundles.gson
|
||||
implementation libs.fastutil
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
|
@ -30,9 +30,9 @@ public class IndexClient extends AbstractDynamicClient {
|
||||
}
|
||||
|
||||
@CheckReturnValue
|
||||
public List<SearchResultItem> query(Context ctx, SearchSpecification specs) {
|
||||
public SearchResultSet query(Context ctx, SearchSpecification specs) {
|
||||
return wmsa_search_index_api_time.time(
|
||||
() -> this.postGet(ctx, "/search/", specs, SearchResultSet.class).blockingFirst().getResults()
|
||||
() -> this.postGet(ctx, "/search/", specs, SearchResultSet.class).blockingFirst()
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -15,14 +15,14 @@ public class SearchResultItem {
|
||||
public final long combinedId;
|
||||
|
||||
/** How did the subqueries match against the document ? */
|
||||
public final List<SearchResultKeywordScore> scores;
|
||||
public final List<SearchResultKeywordScore> keywordScores;
|
||||
|
||||
/** How many other potential results existed in the same domain */
|
||||
public int resultsFromDomain;
|
||||
|
||||
public SearchResultItem(long val) {
|
||||
this.combinedId = val;
|
||||
this.scores = new ArrayList<>(16);
|
||||
this.keywordScores = new ArrayList<>(16);
|
||||
}
|
||||
|
||||
public EdgeId<EdgeUrl> getUrlId() {
|
||||
@ -37,11 +37,11 @@ public class SearchResultItem {
|
||||
}
|
||||
|
||||
/* Used for evaluation */
|
||||
private transient double scoreValue = 1;
|
||||
public void setScore(double score) {
|
||||
private transient SearchResultPreliminaryScore scoreValue = null;
|
||||
public void setScore(SearchResultPreliminaryScore score) {
|
||||
scoreValue = score;
|
||||
}
|
||||
public double getScore() {
|
||||
public SearchResultPreliminaryScore getScore() {
|
||||
return scoreValue;
|
||||
}
|
||||
|
||||
|
@ -26,68 +26,17 @@ public final class SearchResultKeywordScore {
|
||||
this.hasPriorityTerms = hasPriorityTerms;
|
||||
}
|
||||
|
||||
private boolean hasTermFlag(WordFlags flag) {
|
||||
public boolean hasTermFlag(WordFlags flag) {
|
||||
return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
|
||||
}
|
||||
|
||||
public double documentValue() {
|
||||
long sum = 0;
|
||||
|
||||
sum += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.;
|
||||
|
||||
sum += DocumentMetadata.decodeTopology(encodedDocMetadata);
|
||||
|
||||
if (DocumentMetadata.hasFlags(encodedDocMetadata, DocumentFlags.Simple.asBit())) {
|
||||
sum += 20;
|
||||
public int positionCount() {
|
||||
return Integer.bitCount(positions());
|
||||
}
|
||||
|
||||
int rank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13;
|
||||
if (rank < 0)
|
||||
sum += rank / 2;
|
||||
else
|
||||
sum += rank / 4;
|
||||
|
||||
return sum;
|
||||
public int tfIdf() {
|
||||
return (int) WordMetadata.decodeTfidf(encodedWordMetadata);
|
||||
}
|
||||
|
||||
public double termValue() {
|
||||
double sum = 0;
|
||||
|
||||
double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
|
||||
int positionBits = WordMetadata.decodePositions(encodedWordMetadata);
|
||||
|
||||
if (hasTermFlag(WordFlags.Title)) {
|
||||
sum -= 15;
|
||||
}
|
||||
|
||||
if (hasTermFlag(WordFlags.Site) && positionBits != 0) {
|
||||
sum -= 10;
|
||||
} else if (hasTermFlag(WordFlags.SiteAdjacent) && positionBits != 0) {
|
||||
sum -= 5;
|
||||
}
|
||||
|
||||
if (hasTermFlag(WordFlags.Subjects)) {
|
||||
sum -= 10;
|
||||
}
|
||||
if (hasTermFlag(WordFlags.NamesWords)) {
|
||||
sum -= 1;
|
||||
}
|
||||
|
||||
if (hasTermFlag(WordFlags.UrlDomain)) {
|
||||
sum -= 5;
|
||||
}
|
||||
|
||||
if (hasTermFlag(WordFlags.UrlPath)) {
|
||||
sum -= 5;
|
||||
}
|
||||
|
||||
|
||||
sum -= tfIdf / 10.;
|
||||
sum -= Integer.bitCount(positionBits) / 3.;
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
public int subquery() {
|
||||
return subquery;
|
||||
}
|
||||
@ -138,8 +87,8 @@ public final class SearchResultKeywordScore {
|
||||
return "SearchResultKeywordScore[" +
|
||||
"set=" + subquery + ", " +
|
||||
"keyword=" + keyword + ", " +
|
||||
"encodedWordMetadata=" + encodedWordMetadata + ", " +
|
||||
"encodedDocMetadata=" + encodedDocMetadata + ", " +
|
||||
"encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " +
|
||||
"encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ", " +
|
||||
"hasPriorityTerms=" + hasPriorityTerms + ']';
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,42 @@
|
||||
package nu.marginalia.index.client.model.results;
|
||||
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import static java.lang.Boolean.compare;
|
||||
import static java.lang.Integer.compare;
|
||||
|
||||
public record SearchResultPreliminaryScore(boolean hasSingleTermMatch,
|
||||
boolean hasPriorityTerm,
|
||||
int minNumberOfFlagsSet,
|
||||
int minNumberOfPositions,
|
||||
int overlappingPositions)
|
||||
implements Comparable<SearchResultPreliminaryScore>
|
||||
{
|
||||
@Override
|
||||
public int compareTo(@NotNull SearchResultPreliminaryScore other) {
|
||||
int diff;
|
||||
|
||||
diff = compare(hasSingleTermMatch, other.hasSingleTermMatch);
|
||||
if (diff != 0) return diff;
|
||||
|
||||
diff = compare(minNumberOfFlagsSet, other.minNumberOfFlagsSet);
|
||||
if (diff != 0) return diff;
|
||||
|
||||
diff = compare(hasPriorityTerm, other.hasPriorityTerm);
|
||||
if (diff != 0) return diff;
|
||||
|
||||
diff = compare(overlappingPositions, other.overlappingPositions);
|
||||
if (diff != 0) return diff;
|
||||
|
||||
return compare(minNumberOfPositions, other.minNumberOfPositions);
|
||||
}
|
||||
|
||||
public boolean isGreat() {
|
||||
return hasSingleTermMatch || (minNumberOfFlagsSet >= 1 && overlappingPositions >= 1);
|
||||
}
|
||||
public boolean isEmpty() {
|
||||
return minNumberOfFlagsSet == 0
|
||||
&& minNumberOfPositions == 0
|
||||
&& overlappingPositions == 0;
|
||||
}
|
||||
}
|
@ -0,0 +1,25 @@
|
||||
package nu.marginalia.index.client.model.results;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
|
||||
import lombok.ToString;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ToString
|
||||
public class SearchResultRankingContext {
|
||||
private final int docCount;
|
||||
private final Object2IntOpenHashMap<String> termCounts = new Object2IntOpenHashMap<>(10, 0.5f);
|
||||
|
||||
public SearchResultRankingContext(int docCount, Map<String, Integer> termCounts) {
|
||||
this.docCount = docCount;
|
||||
this.termCounts.putAll(termCounts);
|
||||
}
|
||||
|
||||
public int termFreqDocCount() {
|
||||
return docCount;
|
||||
}
|
||||
|
||||
public int frequency(String keyword) {
|
||||
return termCounts.getOrDefault(keyword, 1);
|
||||
}
|
||||
}
|
@ -9,7 +9,7 @@ import java.util.List;
|
||||
@AllArgsConstructor @Getter @ToString
|
||||
public class SearchResultSet {
|
||||
public List<SearchResultItem> results;
|
||||
|
||||
public SearchResultRankingContext rankingContext;
|
||||
public int size() {
|
||||
return results.size();
|
||||
}
|
||||
|
32
code/common/process/build.gradle
Normal file
32
code/common/process/build.gradle
Normal file
@ -0,0 +1,32 @@
|
||||
plugins {
    id 'java'
    id "io.freefair.lombok" version "5.3.3.3"
    id 'jvm-test-suite'
}

// Build with a Java 17 toolchain
java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(17))
    }
}

dependencies {
    implementation libs.notnull
    implementation libs.lombok
    annotationProcessor libs.lombok

    implementation libs.bundles.slf4j

    implementation libs.guava
    implementation libs.guice
    implementation libs.commons.lang3

    implementation libs.snakeyaml

    // Test-only dependencies; each bundle is declared exactly once
    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}
|
||||
|
||||
|
4
code/common/process/readme.md
Normal file
4
code/common/process/readme.md
Normal file
@ -0,0 +1,4 @@
|
||||
# Process
|
||||
|
||||
Basic functionality for a Process. Processes must include this dependency to ensure
|
||||
their loggers are configured properly!
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.work_log;
|
||||
package nu.marginalia.process.log;
|
||||
|
||||
import com.google.errorprone.annotations.MustBeClosed;
|
||||
import org.apache.logging.log4j.util.Strings;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.work_log;
|
||||
package nu.marginalia.process.log;
|
||||
|
||||
public record WorkLogEntry(String id, String ts, String path, int cnt) {
|
||||
}
|
9
code/common/process/src/main/resources/log4j2.properties
Normal file
9
code/common/process/src/main/resources/log4j2.properties
Normal file
@ -0,0 +1,9 @@
|
||||
log4j2.isThreadContextMapInheritable=true
status = info

# Console appender used by the root logger
appender.console.type = Console
appender.console.name = LogToConsole
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg{nolookups}%n

# NOTE(review): this MarkerFilter was declared without a `marker` attribute. Log4j2 refuses
# to create a MarkerFilter without one, so the line only produced a configuration error.
# It is disabled here; to filter on a marker, restore it together with
# appender.console.filter.http.marker / .onMatch / .onMismatch.
# appender.console.filter.http.type = MarkerFilter

rootLogger.level = info
rootLogger.appenderRef.console.ref = LogToConsole
|
@ -6,7 +6,7 @@ functions based on [POS tags](https://www.ling.upenn.edu/courses/Fall_2003/ling0
|
||||
|
||||
## Central Classes
|
||||
|
||||
* [DocumentKeywordExtractor](src/main/java/nu/marginalia/keyword_extraction/DocumentKeywordExtractor.java)
|
||||
* [DocumentKeywordExtractor](src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java)
|
||||
|
||||
## See Also
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.keyword_extraction;
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import nu.marginalia.keyword_extraction.extractors.*;
|
||||
import nu.marginalia.keyword_extraction.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.keyword.extractors.*;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
import nu.marginalia.language.encoding.AsciiFlattener;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
@ -73,6 +73,8 @@ public class DocumentKeywordExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
|
||||
KeywordMetadata metadata,
|
||||
DocumentLanguageData documentLanguageData)
|
||||
@ -88,7 +90,7 @@ public class DocumentKeywordExtractor {
|
||||
}
|
||||
|
||||
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
|
||||
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
|
||||
if (matchesWordPattern(w)) {
|
||||
wordsBuilder.add(w, metadata.getMetadataForWord(word.stemmed()));
|
||||
}
|
||||
}
|
||||
@ -101,4 +103,43 @@ public class DocumentKeywordExtractor {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
boolean matchesWordPattern(String s) {
|
||||
// this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
|
||||
|
||||
String wordPartSeparator = ".-_/:+*";
|
||||
|
||||
int i = 0;
|
||||
|
||||
for (int run = 0; run < 15 && i < s.length(); run++, i++) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= 'a' && c <= 'z') continue;
|
||||
if (c >= 'A' && c <= 'Z') continue;
|
||||
if (c >= '0' && c <= '9') continue;
|
||||
break;
|
||||
}
|
||||
|
||||
if (i == 0)
|
||||
return false;
|
||||
|
||||
for (int j = 0; j < 5; j++) {
|
||||
if (i == s.length()) return true;
|
||||
|
||||
if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
i++;
|
||||
|
||||
for (int run = 0; run < 10 && i < s.length(); run++, i++) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= 'a' && c <= 'z') continue;
|
||||
if (c >= 'A' && c <= 'Z') continue;
|
||||
if (c >= '0' && c <= '9') continue;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.keyword_extraction;
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
import nu.marginalia.language.model.DocumentSentence;
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.keyword_extraction;
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import lombok.Builder;
|
||||
import nu.marginalia.keyword_extraction.extractors.*;
|
||||
import nu.marginalia.keyword.extractors.*;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.keyword_extraction;
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import nu.marginalia.language.model.WordRep;
|
||||
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.keyword_extraction.extractors;
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
|
@ -1,8 +1,8 @@
|
||||
package nu.marginalia.keyword_extraction.extractors;
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
|
||||
import nu.marginalia.keyword_extraction.KeywordExtractor;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
|
||||
/** Generates a position bitmask for each word in a document */
|
@ -1,13 +1,13 @@
|
||||
package nu.marginalia.keyword_extraction.extractors;
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import com.google.common.base.CharMatcher;
|
||||
import it.unimi.dsi.fastutil.objects.Object2IntMap;
|
||||
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
|
||||
import nu.marginalia.keyword_extraction.WordReps;
|
||||
import nu.marginalia.keyword.WordReps;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.DocumentSentence;
|
||||
import nu.marginalia.language.model.WordRep;
|
||||
import nu.marginalia.keyword_extraction.KeywordExtractor;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
@ -1,12 +1,12 @@
|
||||
package nu.marginalia.keyword_extraction.extractors;
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
|
||||
import nu.marginalia.keyword_extraction.WordReps;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.keyword.WordReps;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.WordRep;
|
||||
import nu.marginalia.language.model.WordSpan;
|
||||
import nu.marginalia.language.model.WordSeparator;
|
||||
import nu.marginalia.keyword_extraction.KeywordExtractor;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.*;
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.keyword_extraction.extractors;
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import nu.marginalia.keyword_extraction.WordReps;
|
||||
import nu.marginalia.keyword_extraction.KeywordExtractor;
|
||||
import nu.marginalia.keyword.WordReps;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.WordRep;
|
||||
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.keyword_extraction.extractors;
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import nu.marginalia.model.EdgeDomain;
|
@ -1,12 +1,12 @@
|
||||
package nu.marginalia.keyword_extraction.extractors;
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
|
||||
import nu.marginalia.keyword_extraction.WordReps;
|
||||
import nu.marginalia.keyword.WordReps;
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.DocumentSentence;
|
||||
import nu.marginalia.language.model.WordRep;
|
||||
import nu.marginalia.keyword_extraction.KeywordExtractor;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.language.model.WordSpan;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import org.apache.commons.lang3.StringUtils;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.keyword_extraction.model;
|
||||
package nu.marginalia.keyword.model;
|
||||
|
||||
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.keyword_extraction.model;
|
||||
package nu.marginalia.keyword.model;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
|
||||
import lombok.Getter;
|
@ -0,0 +1,24 @@
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class DocumentKeywordExtractorTest {
|
||||
|
||||
@Test
|
||||
public void testWordPattern() {
|
||||
DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(null);
|
||||
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("test"));
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
|
||||
Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));
|
||||
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test"));
|
||||
Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test"));
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24"));
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("std::vector"));
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("c++"));
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h"));
|
||||
Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
|
||||
}
|
||||
}
|
@ -1,8 +1,7 @@
|
||||
package nu.marginalia.keyword_extraction;
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
import nu.marginalia.language.model.WordRep;
|
||||
import nu.marginalia.language.model.WordSpan;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
@ -106,10 +105,6 @@ class SentenceExtractorTest {
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPattern() {
|
||||
System.out.println(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches());
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Test
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.keyword_extraction.extractors;
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.test.util.TestLanguageModels;
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.keyword_extraction.extractors;
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import nu.marginalia.keyword_extraction.KeywordExtractor;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.test.util.TestLanguageModels;
|
||||
import org.junit.jupiter.api.Test;
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.keyword_extraction.extractors;
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import nu.marginalia.keyword_extraction.KeywordExtractor;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import nu.marginalia.test.util.TestLanguageModels;
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.keyword_extraction.extractors;
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import nu.marginalia.keyword_extraction.KeywordExtractor;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.test.util.TestLanguageModels;
|
||||
import org.jsoup.Jsoup;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.keyword_extraction.extractors;
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import nu.marginalia.model.EdgeUrl;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.logic.pubdate;
|
||||
package nu.marginalia.pubdate;
|
||||
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import org.junit.jupiter.api.Test;
|
@ -9,3 +9,4 @@
|
||||
* [adblock](adblock/) - Simulates Adblock
|
||||
* [pubdate](pubdate/) - Determines when a document was published
|
||||
* [topic-detection](topic-detection/) - Tries to identify the topic of a website
|
||||
* [summary-extraction](summary-extraction/) - Finds a descriptive passage of text that summarizes what a search result is about
|
@ -1,7 +1,7 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
id "io.freefair.lombok" version "5.3.3.3"
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
@ -11,26 +11,28 @@ java {
|
||||
}
|
||||
}
|
||||
|
||||
application {
|
||||
mainClass = 'nu.marginalia.converting.ConverterMain'
|
||||
applicationName = 'converter-process'
|
||||
}
|
||||
|
||||
tasks.distZip.enabled = false
|
||||
|
||||
dependencies {
|
||||
implementation libs.notnull
|
||||
implementation libs.lombok
|
||||
annotationProcessor libs.lombok
|
||||
|
||||
implementation libs.bundles.gson
|
||||
implementation libs.rxjava
|
||||
implementation libs.bundles.slf4j
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
|
||||
implementation libs.guava
|
||||
implementation libs.guice
|
||||
implementation libs.notnull
|
||||
|
||||
implementation libs.snakeyaml
|
||||
implementation libs.jsoup
|
||||
implementation libs.zstd
|
||||
|
||||
implementation libs.commons.net
|
||||
|
||||
implementation libs.opencsv
|
||||
implementation libs.guice
|
||||
implementation libs.guava
|
||||
implementation libs.bundles.gson
|
||||
implementation libs.trove
|
||||
implementation libs.fastutil
|
||||
implementation libs.commons.lang3
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
@ -38,6 +40,7 @@ dependencies {
|
||||
}
|
||||
|
||||
test {
|
||||
maxHeapSize = "8G"
|
||||
useJUnitPlatform()
|
||||
}
|
||||
|
17
code/features-convert/summary-extraction/readme.md
Normal file
17
code/features-convert/summary-extraction/readme.md
Normal file
@ -0,0 +1,17 @@
|
||||
# Summary Extraction
|
||||
|
||||
This feature attempts to find a descriptive passage of text that summarizes
|
||||
what a search result "is about". It's the text you see below a search result.
|
||||
|
||||
It uses several naive heuristics to try to find something that makes sense,
|
||||
and there is probably room for improvement.
|
||||
|
||||
There are many good techniques for doing this, but they've sadly not proved
|
||||
particularly fast. Whatever solution is used needs to be able to summarize on the
|
||||
order of 100,000,000 documents with a time budget of a couple of hours.
|
||||
|
||||
## Central Classes
|
||||
|
||||
* [SummaryExtractor](src/main/java/nu/marginalia/summary/SummaryExtractor.java)
|
||||
* [SummaryExtractionFilter](src/main/java/nu/marginalia/summary/SummaryExtractionFilter.java) - DOM pruning algo.
|
||||
Doesn't always work, but when it works it's pretty good.
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.processor.logic.summary;
|
||||
package nu.marginalia.summary;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import org.apache.commons.lang3.StringUtils;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.processor.logic.summary;
|
||||
package nu.marginalia.summary;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
@ -19,9 +19,8 @@ public class SummaryExtractor {
|
||||
}
|
||||
|
||||
public String extractSummary(Document parsed) {
|
||||
String summaryString;
|
||||
String summaryString = extractSummaryRaw(parsed);
|
||||
|
||||
summaryString = extractSummaryRaw(parsed);
|
||||
summaryString = truncatedCharacters.matcher(summaryString).replaceAll(" ");
|
||||
summaryString = StringUtils.abbreviate(summaryString, "", maxSummaryLength);
|
||||
|
||||
@ -81,7 +80,7 @@ public class SummaryExtractor {
|
||||
}
|
||||
|
||||
if (content.length() > 32) {
|
||||
// AAAA AAAA AAAA AAAA AAAA AAAA AAAA AAAA
|
||||
// AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHH
|
||||
return content.toString();
|
||||
}
|
||||
|
@ -1,17 +1,13 @@
|
||||
package nu.marginalia.converting.logic;
|
||||
package nu.marginalia.summary;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.logic.summary.SummaryExtractionFilter;
|
||||
import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
|
||||
import nu.marginalia.summary.SummaryExtractionFilter;
|
||||
import nu.marginalia.summary.SummaryExtractor;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
@ -43,47 +39,6 @@ class SummaryExtractorTest {
|
||||
System.out.println(e.getValue().text());
|
||||
});
|
||||
}
|
||||
@Test
|
||||
public void testSummaryFilter3() throws IOException {
|
||||
var data = WmsaHome.getHomePath().resolve("test-data/url-327999153");
|
||||
String html = Files.readString(data);
|
||||
var doc = Jsoup.parse(html);
|
||||
var filter = new SummaryExtractionFilter();
|
||||
doc.filter(filter);
|
||||
|
||||
filter.getSummary(255);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSummaryFilter2() throws IOException {
|
||||
var data = WmsaHome.getHomePath().resolve("test-data/");
|
||||
|
||||
System.out.println("Running");
|
||||
|
||||
var fos = new PrintWriter(new FileOutputStream("/tmp/summaryDiff.html"));
|
||||
fos.println("<table>");
|
||||
|
||||
for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
|
||||
|
||||
var doc = Jsoup.parse(Files.readString(file.toPath()));
|
||||
fos.println("<tr><th colspan=2>" + file.getName() + "</th></tr>");
|
||||
fos.println("<tr><td width=50%>");
|
||||
var filter = new SummaryExtractionFilter();
|
||||
|
||||
doc.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove();
|
||||
doc.filter(filter);
|
||||
var ret = filter.getSummary(255);
|
||||
|
||||
fos.println(ret);
|
||||
fos.println("</td><td width=50%>");
|
||||
String summary = summaryExtractor.extractSummary(Jsoup.parse(Files.readString(file.toPath())));
|
||||
fos.println(summary);
|
||||
fos.println("</td></tr>");
|
||||
}
|
||||
|
||||
fos.println("</table>");
|
||||
fos.flush();
|
||||
}
|
||||
|
||||
@Test
|
||||
void extractSurrey() throws IOException {
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user