Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00
Yet more restructuring. Improved search result ranking.
commit 449471a076
parent 5ef17a2a20
@@ -28,7 +28,8 @@ dependencies {
     implementation libs.guice
     implementation libs.rxjava
     implementation libs.protobuf
-    implementation libs.gson
+    implementation libs.bundles.gson
+    implementation libs.fastutil

     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
@@ -30,9 +30,9 @@ public class IndexClient extends AbstractDynamicClient {
     }

     @CheckReturnValue
-    public List<SearchResultItem> query(Context ctx, SearchSpecification specs) {
+    public SearchResultSet query(Context ctx, SearchSpecification specs) {
         return wmsa_search_index_api_time.time(
-            () -> this.postGet(ctx, "/search/", specs, SearchResultSet.class).blockingFirst().getResults()
+            () -> this.postGet(ctx, "/search/", specs, SearchResultSet.class).blockingFirst()
         );
     }

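A minimal call-site sketch (editorial, not part of the diff) of what the changed signature implies; `indexClient`, `ctx`, and `specs` are assumed to be in scope:

```java
// query() now returns the whole SearchResultSet, so the caller unwraps
// the result list itself, and also gains access to the rankingContext
// field that a later hunk in this diff adds to SearchResultSet.
SearchResultSet set = indexClient.query(ctx, specs);
List<SearchResultItem> items = set.results;
SearchResultRankingContext rankingContext = set.rankingContext;
```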
@@ -15,14 +15,14 @@ public class SearchResultItem {
     public final long combinedId;

     /** How did the subqueries match against the document ? */
-    public final List<SearchResultKeywordScore> scores;
+    public final List<SearchResultKeywordScore> keywordScores;

     /** How many other potential results existed in the same domain */
     public int resultsFromDomain;

     public SearchResultItem(long val) {
         this.combinedId = val;
-        this.scores = new ArrayList<>(16);
+        this.keywordScores = new ArrayList<>(16);
     }

     public EdgeId<EdgeUrl> getUrlId() {
@@ -37,11 +37,11 @@ public class SearchResultItem {
     }

     /* Used for evaluation */
-    private transient double scoreValue = 1;
-    public void setScore(double score) {
+    private transient SearchResultPreliminaryScore scoreValue = null;
+    public void setScore(SearchResultPreliminaryScore score) {
         scoreValue = score;
     }
-    public double getScore() {
+    public SearchResultPreliminaryScore getScore() {
         return scoreValue;
     }

@@ -26,68 +26,17 @@ public final class SearchResultKeywordScore {
         this.hasPriorityTerms = hasPriorityTerms;
     }

-    private boolean hasTermFlag(WordFlags flag) {
+    public boolean hasTermFlag(WordFlags flag) {
         return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
     }

-    public double documentValue() {
-        long sum = 0;
-
-        sum += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.;
-
-        sum += DocumentMetadata.decodeTopology(encodedDocMetadata);
-
-        if (DocumentMetadata.hasFlags(encodedDocMetadata, DocumentFlags.Simple.asBit())) {
-            sum += 20;
-        }
-
-        int rank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13;
-        if (rank < 0)
-            sum += rank / 2;
-        else
-            sum += rank / 4;
-
-        return sum;
-    }
-
-    public double termValue() {
-        double sum = 0;
-
-        double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
-        int positionBits = WordMetadata.decodePositions(encodedWordMetadata);
-
-        if (hasTermFlag(WordFlags.Title)) {
-            sum -= 15;
-        }
-
-        if (hasTermFlag(WordFlags.Site) && positionBits != 0) {
-            sum -= 10;
-        } else if (hasTermFlag(WordFlags.SiteAdjacent) && positionBits != 0) {
-            sum -= 5;
-        }
-
-        if (hasTermFlag(WordFlags.Subjects)) {
-            sum -= 10;
-        }
-        if (hasTermFlag(WordFlags.NamesWords)) {
-            sum -= 1;
-        }
-
-        if (hasTermFlag(WordFlags.UrlDomain)) {
-            sum -= 5;
-        }
-
-        if (hasTermFlag(WordFlags.UrlPath)) {
-            sum -= 5;
-        }
-
-        sum -= tfIdf / 10.;
-        sum -= Integer.bitCount(positionBits) / 3.;
-
-        return sum;
-    }
+    public int positionCount() {
+        return Integer.bitCount(positions());
+    }
+
+    public int tfIdf() {
+        return (int) WordMetadata.decodeTfidf(encodedWordMetadata);
+    }

     public int subquery() {
         return subquery;
     }
@@ -138,8 +87,8 @@ public final class SearchResultKeywordScore {
         return "SearchResultKeywordScore[" +
                 "set=" + subquery + ", " +
                 "keyword=" + keyword + ", " +
-                "encodedWordMetadata=" + encodedWordMetadata + ", " +
-                "encodedDocMetadata=" + encodedDocMetadata + ", " +
+                "encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " +
+                "encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ", " +
                 "hasPriorityTerms=" + hasPriorityTerms + ']';
     }

|
@ -0,0 +1,42 @@
|
|||||||
|
package nu.marginalia.index.client.model.results;
|
||||||
|
|
||||||
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
|
import static java.lang.Boolean.compare;
|
||||||
|
import static java.lang.Integer.compare;
|
||||||
|
|
||||||
|
public record SearchResultPreliminaryScore(boolean hasSingleTermMatch,
|
||||||
|
boolean hasPriorityTerm,
|
||||||
|
int minNumberOfFlagsSet,
|
||||||
|
int minNumberOfPositions,
|
||||||
|
int overlappingPositions)
|
||||||
|
implements Comparable<SearchResultPreliminaryScore>
|
||||||
|
{
|
||||||
|
@Override
|
||||||
|
public int compareTo(@NotNull SearchResultPreliminaryScore other) {
|
||||||
|
int diff;
|
||||||
|
|
||||||
|
diff = compare(hasSingleTermMatch, other.hasSingleTermMatch);
|
||||||
|
if (diff != 0) return diff;
|
||||||
|
|
||||||
|
diff = compare(minNumberOfFlagsSet, other.minNumberOfFlagsSet);
|
||||||
|
if (diff != 0) return diff;
|
||||||
|
|
||||||
|
diff = compare(hasPriorityTerm, other.hasPriorityTerm);
|
||||||
|
if (diff != 0) return diff;
|
||||||
|
|
||||||
|
diff = compare(overlappingPositions, other.overlappingPositions);
|
||||||
|
if (diff != 0) return diff;
|
||||||
|
|
||||||
|
return compare(minNumberOfPositions, other.minNumberOfPositions);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isGreat() {
|
||||||
|
return hasSingleTermMatch || (minNumberOfFlagsSet >= 1 && overlappingPositions >= 1);
|
||||||
|
}
|
||||||
|
public boolean isEmpty() {
|
||||||
|
return minNumberOfFlagsSet == 0
|
||||||
|
&& minNumberOfPositions == 0
|
||||||
|
&& overlappingPositions == 0;
|
||||||
|
}
|
||||||
|
}
|
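A small sketch (editorial; the constructor values are invented) of how the tiered comparator above behaves: hasSingleTermMatch is compared first, so it dominates every later field:

```java
// Invented example values, for illustration only.
var a = new SearchResultPreliminaryScore(true, false, 0, 0, 0);
var b = new SearchResultPreliminaryScore(false, true, 2, 3, 1);

// a orders above b despite b winning every later tiebreaker
assert a.compareTo(b) > 0;

assert a.isGreat();   // a single-term match suffices
assert b.isGreat();   // minNumberOfFlagsSet >= 1 && overlappingPositions >= 1
assert !b.isEmpty();
```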
@@ -0,0 +1,25 @@
+package nu.marginalia.index.client.model.results;
+
+import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
+import lombok.ToString;
+
+import java.util.Map;
+
+@ToString
+public class SearchResultRankingContext {
+    private final int docCount;
+    private final Object2IntOpenHashMap<String> termCounts = new Object2IntOpenHashMap<>(10, 0.5f);
+
+    public SearchResultRankingContext(int docCount, Map<String, Integer> termCounts) {
+        this.docCount = docCount;
+        this.termCounts.putAll(termCounts);
+    }
+
+    public int termFreqDocCount() {
+        return docCount;
+    }
+
+    public int frequency(String keyword) {
+        return termCounts.getOrDefault(keyword, 1);
+    }
+}
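The class above carries corpus statistics for ranking. A hedged sketch of how it might be consumed; the log(N/df) inverse-document-frequency formula is an assumption for illustration, not something this commit shows:

```java
// Hypothetical consumer (assumes java.util.Map is imported).
SearchResultRankingContext rankingCtx =
        new SearchResultRankingContext(10_000, Map.of("marginalia", 25));

double idfCommon = Math.log((double) rankingCtx.termFreqDocCount()
        / rankingCtx.frequency("marginalia"));    // log(10000 / 25)

// frequency() defaults unseen keywords to 1, giving them maximal idf
double idfUnseen = Math.log((double) rankingCtx.termFreqDocCount()
        / rankingCtx.frequency("zanzibar"));      // log(10000 / 1)
```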
@@ -9,7 +9,8 @@ import java.util.List;
 @AllArgsConstructor @Getter @ToString
 public class SearchResultSet {
     public List<SearchResultItem> results;
+    public SearchResultRankingContext rankingContext;
     public int size() {
         return results.size();
     }
code/common/process/build.gradle (new file, 32 lines)
@@ -0,0 +1,32 @@
+plugins {
+    id 'java'
+    id "io.freefair.lombok" version "5.3.3.3"
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(17))
+    }
+}
+
+dependencies {
+    implementation libs.notnull
+    implementation libs.lombok
+    annotationProcessor libs.lombok
+
+    implementation libs.bundles.slf4j
+    testImplementation libs.bundles.slf4j.test
+
+    implementation libs.guava
+    implementation libs.guice
+    implementation libs.commons.lang3
+
+    implementation libs.snakeyaml
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
+
code/common/process/readme.md (new file, 4 lines)
@@ -0,0 +1,4 @@
+# Process
+
+Basic functionality for a Process. Processes must include this dependency to ensure
+their loggers are configured properly!
@@ -1,4 +1,4 @@
-package nu.marginalia.work_log;
+package nu.marginalia.process.log;

 import com.google.errorprone.annotations.MustBeClosed;
 import org.apache.logging.log4j.util.Strings;
@@ -1,4 +1,4 @@
-package nu.marginalia.work_log;
+package nu.marginalia.process.log;

 public record WorkLogEntry(String id, String ts, String path, int cnt) {
 }
code/common/process/src/main/resources/log4j2.properties (new file, 9 lines)
@@ -0,0 +1,9 @@
+log4j2.isThreadContextMapInheritable=true
+status = info
+appender.console.type = Console
+appender.console.name = LogToConsole
+appender.console.layout.type = PatternLayout
+appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg{nolookups}%n
+appender.console.filter.http.type = MarkerFilter
+rootLogger.level = info
+rootLogger.appenderRef.console.ref = LogToConsole
@@ -6,7 +6,7 @@ functions based on [POS tags](https://www.ling.upenn.edu/courses/Fall_2003/ling0

 ## Central Classes

-* [DocumentKeywordExtractor](src/main/java/nu/marginalia/keyword_extraction/DocumentKeywordExtractor.java)
+* [DocumentKeywordExtractor](src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java)

 ## See Also

@@ -1,7 +1,7 @@
-package nu.marginalia.keyword_extraction;
+package nu.marginalia.keyword;

-import nu.marginalia.keyword_extraction.extractors.*;
-import nu.marginalia.keyword_extraction.model.DocumentKeywordsBuilder;
+import nu.marginalia.keyword.extractors.*;
+import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.WordPatterns;
 import nu.marginalia.language.encoding.AsciiFlattener;
 import nu.marginalia.language.model.DocumentLanguageData;
@@ -73,6 +73,8 @@ public class DocumentKeywordExtractor {
         }
     }

+
+
     private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
                                    KeywordMetadata metadata,
                                    DocumentLanguageData documentLanguageData)
@@ -88,7 +90,7 @@ public class DocumentKeywordExtractor {
             }

             String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
-            if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
+            if (matchesWordPattern(w)) {
                 wordsBuilder.add(w, metadata.getMetadataForWord(word.stemmed()));
             }
         }
@@ -101,4 +103,43 @@ public class DocumentKeywordExtractor {
             }
         }
     }
+
+    boolean matchesWordPattern(String s) {
+        // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
+
+        String wordPartSeparator = ".-_/:+*";
+
+        int i = 0;
+
+        for (int run = 0; run < 15 && i < s.length(); run++, i++) {
+            char c = s.charAt(i);
+            if (c >= 'a' && c <= 'z') continue;
+            if (c >= 'A' && c <= 'Z') continue;
+            if (c >= '0' && c <= '9') continue;
+            break;
+        }
+
+        if (i == 0)
+            return false;
+
+        for (int j = 0; j < 5; j++) {
+            if (i == s.length()) return true;
+
+            if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
+                return false;
+            }
+
+            i++;
+
+            for (int run = 0; run < 10 && i < s.length(); run++, i++) {
+                char c = s.charAt(i);
+                if (c >= 'a' && c <= 'z') continue;
+                if (c >= 'A' && c <= 'Z') continue;
+                if (c >= '0' && c <= '9') continue;
+                break;
+            }
+        }
+
+        return false;
+    }
 }
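For reference, a regex spelling of the unrolled matcher added above (editorial sketch, not part of the commit). The inner loop may consume zero characters after a separator, which is how "c++" and "std::vector" pass in the new test later in this diff, so the faithful pattern uses {0,10} where the in-code comment says {1,10}:

```java
import java.util.regex.Pattern;

// Reference-only equivalent of DocumentKeywordExtractor.matchesWordPattern();
// slower than the unrolled loops, but makes the accepted shape explicit.
class WordPatternReference {
    private static final Pattern WORD = Pattern.compile(
            "[\\da-zA-Z]{1,15}([.\\-_/:+*][\\da-zA-Z]{0,10}){0,4}");

    static boolean matchesWordPattern(String s) {
        return WORD.matcher(s).matches();
    }
}
```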
@@ -1,4 +1,4 @@
-package nu.marginalia.keyword_extraction;
+package nu.marginalia.keyword;

 import nu.marginalia.language.WordPatterns;
 import nu.marginalia.language.model.DocumentSentence;
@@ -1,7 +1,7 @@
-package nu.marginalia.keyword_extraction;
+package nu.marginalia.keyword;

 import lombok.Builder;
-import nu.marginalia.keyword_extraction.extractors.*;
+import nu.marginalia.keyword.extractors.*;
 import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.model.idx.WordFlags;

@@ -1,4 +1,4 @@
-package nu.marginalia.keyword_extraction;
+package nu.marginalia.keyword;

 import nu.marginalia.language.model.WordRep;

@@ -1,4 +1,4 @@
-package nu.marginalia.keyword_extraction.extractors;
+package nu.marginalia.keyword.extractors;

 import nu.marginalia.language.model.DocumentLanguageData;

@@ -1,8 +1,8 @@
-package nu.marginalia.keyword_extraction.extractors;
+package nu.marginalia.keyword.extractors;

 import com.google.inject.Inject;
 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
-import nu.marginalia.keyword_extraction.KeywordExtractor;
+import nu.marginalia.keyword.KeywordExtractor;
 import nu.marginalia.language.model.DocumentLanguageData;

 /** Generates a position bitmask for each word in a document */
@@ -1,13 +1,13 @@
-package nu.marginalia.keyword_extraction.extractors;
+package nu.marginalia.keyword.extractors;

 import com.google.common.base.CharMatcher;
 import it.unimi.dsi.fastutil.objects.Object2IntMap;
 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
-import nu.marginalia.keyword_extraction.WordReps;
+import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.DocumentSentence;
 import nu.marginalia.language.model.WordRep;
-import nu.marginalia.keyword_extraction.KeywordExtractor;
+import nu.marginalia.keyword.KeywordExtractor;

 import java.util.*;
 import java.util.stream.Collectors;
@@ -1,12 +1,12 @@
-package nu.marginalia.keyword_extraction.extractors;
+package nu.marginalia.keyword.extractors;

 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
-import nu.marginalia.keyword_extraction.WordReps;
+import nu.marginalia.keyword.KeywordExtractor;
+import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.language.model.WordSpan;
 import nu.marginalia.language.model.WordSeparator;
-import nu.marginalia.keyword_extraction.KeywordExtractor;
 import org.apache.commons.lang3.StringUtils;

 import java.util.*;
@@ -1,7 +1,7 @@
-package nu.marginalia.keyword_extraction.extractors;
+package nu.marginalia.keyword.extractors;

-import nu.marginalia.keyword_extraction.WordReps;
-import nu.marginalia.keyword_extraction.KeywordExtractor;
+import nu.marginalia.keyword.WordReps;
+import nu.marginalia.keyword.KeywordExtractor;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.WordRep;

@@ -1,4 +1,4 @@
-package nu.marginalia.keyword_extraction.extractors;
+package nu.marginalia.keyword.extractors;

 import ca.rmen.porterstemmer.PorterStemmer;
 import nu.marginalia.model.EdgeDomain;
@@ -1,12 +1,12 @@
-package nu.marginalia.keyword_extraction.extractors;
+package nu.marginalia.keyword.extractors;

 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
-import nu.marginalia.keyword_extraction.WordReps;
+import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.WordPatterns;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.DocumentSentence;
 import nu.marginalia.language.model.WordRep;
-import nu.marginalia.keyword_extraction.KeywordExtractor;
+import nu.marginalia.keyword.KeywordExtractor;
 import nu.marginalia.language.model.WordSpan;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.apache.commons.lang3.StringUtils;
@@ -1,4 +1,4 @@
-package nu.marginalia.keyword_extraction.model;
+package nu.marginalia.keyword.model;


 import nu.marginalia.model.idx.WordMetadata;
@@ -1,4 +1,4 @@
-package nu.marginalia.keyword_extraction.model;
+package nu.marginalia.keyword.model;

 import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
 import lombok.Getter;
@@ -0,0 +1,24 @@
+package nu.marginalia.keyword;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+class DocumentKeywordExtractorTest {
+
+    @Test
+    public void testWordPattern() {
+        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(null);
+
+        Assertions.assertTrue(extractor.matchesWordPattern("test"));
+        Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
+        Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));
+
+        Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test"));
+        Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test"));
+        Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24"));
+        Assertions.assertTrue(extractor.matchesWordPattern("std::vector"));
+        Assertions.assertTrue(extractor.matchesWordPattern("c++"));
+        Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h"));
+        Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
+    }
+}
@@ -1,8 +1,7 @@
-package nu.marginalia.keyword_extraction;
+package nu.marginalia.keyword;

 import lombok.SneakyThrows;
 import nu.marginalia.LanguageModels;
-import nu.marginalia.language.WordPatterns;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.language.model.WordSpan;
 import nu.marginalia.language.sentence.SentenceExtractor;
@@ -106,10 +105,6 @@ class SentenceExtractorTest {

     }

-    @Test
-    public void testPattern() {
-        System.out.println(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches());
-    }
-
     @SneakyThrows
     @Test
@@ -1,4 +1,4 @@
-package nu.marginalia.keyword_extraction.extractors;
+package nu.marginalia.keyword.extractors;

 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.test.util.TestLanguageModels;
@@ -1,7 +1,7 @@
-package nu.marginalia.keyword_extraction.extractors;
+package nu.marginalia.keyword.extractors;

 import com.google.common.collect.Sets;
-import nu.marginalia.keyword_extraction.KeywordExtractor;
+import nu.marginalia.keyword.KeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.test.util.TestLanguageModels;
 import org.junit.jupiter.api.Test;
@@ -1,7 +1,7 @@
-package nu.marginalia.keyword_extraction.extractors;
+package nu.marginalia.keyword.extractors;

 import com.google.common.collect.Sets;
-import nu.marginalia.keyword_extraction.KeywordExtractor;
+import nu.marginalia.keyword.KeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.test.util.TestLanguageModels;
@@ -1,7 +1,7 @@
-package nu.marginalia.keyword_extraction.extractors;
+package nu.marginalia.keyword.extractors;

 import com.google.common.collect.Sets;
-import nu.marginalia.keyword_extraction.KeywordExtractor;
+import nu.marginalia.keyword.KeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.test.util.TestLanguageModels;
 import org.jsoup.Jsoup;
@@ -1,4 +1,4 @@
-package nu.marginalia.keyword_extraction.extractors;
+package nu.marginalia.keyword.extractors;

 import ca.rmen.porterstemmer.PorterStemmer;
 import nu.marginalia.model.EdgeUrl;
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.logic.pubdate;
+package nu.marginalia.pubdate;

 import nu.marginalia.model.crawl.PubDate;
 import org.junit.jupiter.api.Test;
@@ -9,3 +9,4 @@
 * [adblock](adblock/) - Simulates Adblock
 * [pubdate](pubdate/) - Determines when a document was published
 * [topic-detection](topic-detection/) - Tries to identify the topic of a website
+* [summary-extraction](summary-extraction/)
@@ -1,7 +1,8 @@
 plugins {
     id 'java'
     id "io.freefair.lombok" version "5.3.3.3"
+    id 'application'
     id 'jvm-test-suite'
 }

@@ -11,26 +11,28 @@ java {
     }
 }

+application {
+    mainClass = 'nu.marginalia.converting.ConverterMain'
+    applicationName = 'converter-process'
+}
+
+tasks.distZip.enabled = false
+
 dependencies {
-    implementation libs.notnull
     implementation libs.lombok
     annotationProcessor libs.lombok

-    implementation libs.bundles.gson
-    implementation libs.rxjava
     implementation libs.bundles.slf4j
-    testImplementation libs.bundles.slf4j.test

-    implementation libs.guava
-    implementation libs.guice
+    implementation libs.notnull

-    implementation libs.snakeyaml
     implementation libs.jsoup
-    implementation libs.zstd

-    implementation libs.commons.net
-    implementation libs.opencsv
+    implementation libs.guice
+    implementation libs.guava
+    implementation libs.bundles.gson
+    implementation libs.trove
+    implementation libs.fastutil
+    implementation libs.commons.lang3

     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
@@ -38,6 +40,7 @@ dependencies {
 }

 test {
+    maxHeapSize = "8G"
     useJUnitPlatform()
 }

code/features-convert/summary-extraction/readme.md (new file, 17 lines)
@@ -0,0 +1,17 @@
+# Summary Extraction
+
+This feature attempts to find a descriptive passage of text that summarizes
+what a search result "is about". It's the text you see below a search result.
+
+It uses several naive heuristics to try to find something that makes sense,
+and there is probably room for improvement.
+
+There are many good techniques for doing this, but they've sadly not proved
+particularly fast. Whatever solution is used needs to be able to summarize on
+the order of 100,000,000 documents within a time budget of a couple of hours.
+
+## Central Classes
+
+* [SummaryExtractor](src/main/java/nu/marginalia/summary/SummaryExtractor.java)
+* [SummaryExtractionFilter](src/main/java/nu/marginalia/summary/SummaryExtractionFilter.java) - DOM pruning algo.
+  Doesn't always work, but when it works it's pretty good.
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.processor.logic.summary;
+package nu.marginalia.summary;

 import com.google.common.base.Strings;
 import org.apache.commons.lang3.StringUtils;
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.processor.logic.summary;
+package nu.marginalia.summary;

 import com.google.inject.Inject;
 import com.google.inject.name.Named;
@@ -19,9 +19,8 @@ public class SummaryExtractor {
     }

     public String extractSummary(Document parsed) {
-        String summaryString;
-
-        summaryString = extractSummaryRaw(parsed);
+        String summaryString = extractSummaryRaw(parsed);
         summaryString = truncatedCharacters.matcher(summaryString).replaceAll(" ");
         summaryString = StringUtils.abbreviate(summaryString, "", maxSummaryLength);

@@ -81,7 +80,7 @@ public class SummaryExtractor {
         }

         if (content.length() > 32) {
-            // AAAA AAAA AAAA AAAA AAAA AAAA AAAA AAAA
+            // AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHH
             return content.toString();
         }

@@ -1,17 +1,13 @@
-package nu.marginalia.converting.logic;
+package nu.marginalia.summary;

-import nu.marginalia.WmsaHome;
-import nu.marginalia.converting.processor.logic.summary.SummaryExtractionFilter;
-import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
+import nu.marginalia.summary.SummaryExtractionFilter;
+import nu.marginalia.summary.SummaryExtractor;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;

-import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.PrintWriter;
-import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Comparator;
 import java.util.HashMap;
@@ -43,47 +39,6 @@ class SummaryExtractorTest {
             System.out.println(e.getValue().text());
         });
     }
-    @Test
-    public void testSummaryFilter3() throws IOException {
-        var data = WmsaHome.getHomePath().resolve("test-data/url-327999153");
-        String html = Files.readString(data);
-        var doc = Jsoup.parse(html);
-        var filter = new SummaryExtractionFilter();
-        doc.filter(filter);
-
-        filter.getSummary(255);
-    }
-
-    @Test
-    public void testSummaryFilter2() throws IOException {
-        var data = WmsaHome.getHomePath().resolve("test-data/");
-
-        System.out.println("Running");
-
-        var fos = new PrintWriter(new FileOutputStream("/tmp/summaryDiff.html"));
-        fos.println("<table>");
-
-        for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
-
-            var doc = Jsoup.parse(Files.readString(file.toPath()));
-            fos.println("<tr><th colspan=2>" + file.getName() + "</th></tr>");
-            fos.println("<tr><td width=50%>");
-            var filter = new SummaryExtractionFilter();
-
-            doc.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove();
-            doc.filter(filter);
-            var ret = filter.getSummary(255);
-
-            fos.println(ret);
-            fos.println("</td><td width=50%>");
-            String summary = summaryExtractor.extractSummary(Jsoup.parse(Files.readString(file.toPath())));
-            fos.println(summary);
-            fos.println("</td></tr>");
-        }
-
-        fos.println("</table>");
-        fos.flush();
-    }
-
     @Test
     void extractSurrey() throws IOException {