Merge pull request 'UX improvements for "show more results".' (#116) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/116
Viktor Lofgren 2022-09-15 15:57:15 +02:00
commit ac7c1772de
21 changed files with 362 additions and 102 deletions
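
In brief: the index query service now counts how many admitted results share a deduplication key (ranking combined with bucket id), carries that count on EdgeSearchResultItem.resultsFromDomain into EdgeUrlDetails.resultsFromSameDomain, and the search-result template renders it as an "N+" link to a site search instead of the old "Search" link. Below is a minimal standalone sketch of that counting scheme, distilled from the diff that follows; the plain HashMap and the hard-coded limit of 3 are stand-ins for the Trove map and the configurable limit in the real ResultDomainDeduplicator.

import java.util.HashMap;
import java.util.Map;

// Simplified sketch of the per-domain result counting added in this commit.
// The real implementation is ResultDomainDeduplicator (backed by a Trove
// TLongIntHashMap); a plain HashMap and a hard-coded limit are used here.
class DomainCountSketch {
    static final int LIMIT_BY_DOMAIN = 3; // assumption; the real limit is passed in

    private final Map<Long, Integer> countsByKey = new HashMap<>();

    // Mirrors EdgeSearchResultItem.deduplicationKey(): combine ranking and
    // bucket id; 0 means "never deduplicate" (ranking unavailable).
    static long deduplicationKey(int ranking, int bucketId) {
        if (ranking == Integer.MAX_VALUE)
            return 0;
        return ranking * 32L + bucketId;
    }

    // Mirrors ResultDomainDeduplicator.test(): admit at most LIMIT_BY_DOMAIN
    // results per key while counting every admission attempt.
    boolean test(int ranking, int bucketId) {
        long key = deduplicationKey(ranking, bucketId);
        if (key == 0)
            return true;
        return countsByKey.merge(key, 1, Integer::sum) <= LIMIT_BY_DOMAIN;
    }

    // Mirrors getCount(): the total seen for this key, which the template
    // renders as an "N+" link when it is greater than 1.
    int getCount(int ranking, int bucketId) {
        long key = deduplicationKey(ranking, bucketId);
        if (key == 0)
            return 1;
        return countsByKey.getOrDefault(key, 1);
    }
}

In the service itself, test() runs as a stream filter over the sorted results, and getCount() is read back after the list is truncated to the query's result limit.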

View File

@@ -28,11 +28,11 @@ import java.nio.file.Path;
 import java.time.Duration;
 import java.time.LocalDateTime;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
 import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotEquals;
 @Tag("e2e")
 @Testcontainers
@@ -188,7 +188,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         System.out.println(driver.getTitle());
         var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
-        assertEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
+        assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query"));
     }
@@ -201,7 +201,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         System.out.println(driver.getTitle());
         var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
-        assertNotEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
+        assertEquals(Collections.emptyList(), getTitlesFromSearchResults(html));
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-yes-js"));
     }
@@ -214,7 +214,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         System.out.println(driver.getTitle());
         var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
-        assertEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
+        assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-no-js"));
     }

View File

@@ -31,7 +31,7 @@ public class WordPatterns {
     public static final Set<String> topWords;
     static {
-        topWords = new HashSet<>(200);
+        topWords = new HashSet<>(200, 0.25f);
         try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-stopwords"),
                 "Could not load word frequency table");
              var br = new BufferedReader(new InputStreamReader(resource))
@@ -87,11 +87,33 @@ public class WordPatterns {
         return true;
     }
+    public static boolean hasWordQualities(String s) {
+        int start = 0;
+        int end = s.length();
+        if (s.charAt(0) == '#') start++;
+        if (end > 1 && s.charAt(end-1) == '#') end--;
+        for (int i = start; i < end; i++) {
+            char c = s.charAt(i);
+            if (!("_@.'+-".indexOf(c) >= 0)
+                && !(c >= 'a' && c <= 'z')
+                && !(c >= 'A' && c <= 'Z')
+                && !(c >= '0' && c <= '9')
+                && !(c >= '\u00C0' && c <= '\u00D6')
+                && !(c >= '\u00D8' && c <= '\u00f6')
+                && !(c >= '\u00f8' && c <= '\u00ff')) {
+                return false;
+            }
+        }
+        return true;
+    }
     public static boolean isStopWord(String s) {
         if (s.length() < MIN_WORD_LENGTH) {
             return true;
         }
-        if (!wordQualitiesPredicate.test(s)) {
+        if (!hasWordQualities(s)) {
             return true;
         }
         if (!filter(s)) {

View File

@@ -1,16 +1,23 @@
 package nu.marginalia.util.language.processing;
-import java.util.function.Predicate;
 import java.util.regex.Pattern;
 public class AsciiFlattener {
     private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:\\-]+");
-    private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$");
-    private static final Predicate<String> plainAscii = plainAsciiPattern.asMatchPredicate();
+    private static boolean isPlainAscii(String s) {
+        for (int i = 0; i < s.length(); i++) {
+            char c = s.charAt(i);
+            if ((c & 0x80) != 0) {
+                return false;
+            }
+        }
+        return true;
+    }
     public static String flattenUnicode(String s) {
-        if (plainAscii.test(s)) {
+        if (isPlainAscii(s)) {
             return s;
         }

View File

@@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
 import javax.inject.Inject;
 import java.util.*;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 public class DocumentKeywordExtractor {
@@ -156,13 +157,16 @@ public class DocumentKeywordExtractor {
         }
     }
+    private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
     private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
         Set<String> reps = new HashSet<>();
         for (var sent : documentLanguageData.sentences) {
             for (var word : sent) {
                 String lc = word.wordLowerCase();
-                if (lc.matches("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+")) {
+                if (lc.length() > 6
+                        && lc.indexOf('@') > 0
+                        && mailLikePattern.matcher(lc).matches()) {
                     reps.add(lc);
                     String domain = lc.substring(lc.indexOf('@'));
@@ -189,6 +193,6 @@ public class DocumentKeywordExtractor {
     }
     public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
-        return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
+        return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns::hasWordQualities).collect(Collectors.toSet()));
     }
 }

View File

@@ -23,8 +23,8 @@ public class KeywordCounter {
     }
     public WordHistogram countHisto(DocumentLanguageData dld) {
-        HashMap<String, Integer> counts = new HashMap<>(1000);
-        HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
+        HashMap<String, Integer> counts = new HashMap<>(15000);
+        HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000);
         for (var sent : dld.sentences) {
@@ -37,15 +37,15 @@ public class KeywordCounter {
                 String stemmed = sent.constructStemmedWordFromSpan(span);
                 counts.merge(stemmed, 1, Integer::sum);
-                instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
+                instances.computeIfAbsent(stemmed, k -> new HashSet<>(500)).add(new WordRep(sent, span));
             }
         }
         double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1);
-        Set<WordRep> h5 = new HashSet<>();
-        Set<WordRep> h10 = new HashSet<>();
-        Set<WordRep> h15 = new HashSet<>();
+        Set<WordRep> h5 = new HashSet<>(2500);
+        Set<WordRep> h10 = new HashSet<>(500);
+        Set<WordRep> h15 = new HashSet<>(500);
         int doubleWordCount = 0;
@@ -65,13 +65,14 @@ public class KeywordCounter {
             histogram.addAll(instances.get(wordStemmed));
         }
         return new WordHistogram(h5, h10, h15);
     }
     private static final Pattern separator = Pattern.compile("_");
     public double getTermValue(Map.Entry<String, Integer> e, double maxValue) {
+        String key = e.getKey();
+        if (key.contains("_")) {
             String[] parts = separator.split(e.getKey());
             double totalValue = 0.;
             for (String part : parts) {
@@ -79,6 +80,10 @@ public class KeywordCounter {
             }
             return totalValue / parts.length;
         }
+        else {
+            return value(key, e.getValue(), maxValue);
+        }
+    }
     double value(String key, double value, double maxValue) {
         double freq = dict.getTermFreqStemmed(key);

View File

@@ -6,10 +6,10 @@ import gnu.trove.map.hash.TObjectIntHashMap;
 import lombok.AllArgsConstructor;
 import lombok.Getter;
 import lombok.SneakyThrows;
+import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
-import nu.marginalia.util.language.conf.LanguageModels;
 import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.stemmer.PorterStemmer;
@@ -127,8 +127,9 @@ public class SentenceExtractor {
     private static final Pattern dotPattern = Pattern.compile("\\.+$");
     private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
+    private static final Pattern spacesPattern = Pattern.compile("\\s+");
     private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
-    private static final Pattern possessivePattern = Pattern.compile("'(s)?$");
     public DocumentSentence extractSentence(String text) {
         var wordsAndSeps = splitSegment(text);
@@ -142,10 +143,20 @@ public class SentenceExtractor {
         );
     }
+    public String normalizeSpaces(String s) {
+        if (s.indexOf('\t') >= 0) {
+            s = s.replace('\t', ' ');
+        }
+        if (s.indexOf('\n') >= 0) {
+            s = s.replace('\n', ' ');
+        }
+        return s;
+    }
     public DocumentSentence[] extractSentencesFromString(String text) {
         String[] sentences;
-        String textNormalizedSpaces = text.replaceAll("\\s", " ");
+        String textNormalizedSpaces = normalizeSpaces(text);
         try {
             sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
         }
@@ -157,10 +168,17 @@ public class SentenceExtractor {
             sentences = Arrays.copyOf(sentences, 250);
         }
-        sentences = Arrays.stream(sentences)
-                .filter(s -> !s.isBlank())
-                .flatMap(s -> Arrays.stream(splitPattern.split(s)))
-                .toArray(String[]::new);
+        List<String> sentenceList = new ArrayList<>();
+        for (var s : sentences) {
+            if (s.isBlank()) continue;
+            if (s.contains("-") || s.contains("|")) {
+                sentenceList.addAll(Arrays.asList(splitPattern.split(s)));
+            }
+            else {
+                sentenceList.add(s);
+            }
+        }
+        sentences = sentenceList.toArray(String[]::new);
         final String[][] tokens = new String[sentences.length][];
         final int[][] separators = new int[sentences.length][];
@@ -178,7 +196,9 @@ public class SentenceExtractor {
                 separators[i] = Arrays.copyOf(separators[i], 250);
             }
             for (int j = 0; j < tokens[i].length; j++) {
-                tokens[i][j] = dotPattern.matcher(tokens[i][j]).replaceAll( "");
+                if (tokens[i][j].endsWith(".")) {
+                    tokens[i][j] = dotPattern.matcher(tokens[i][j]).replaceAll("");
+                }
             }
         }
@@ -204,7 +224,7 @@ public class SentenceExtractor {
     private String[] stemSentence(String[] strings) {
         String[] stemmed = new String[strings.length];
         for (int i = 0; i < stemmed.length; i++) {
-            var sent = possessivePattern.matcher(strings[i]).replaceAll("");
+            var sent = cleanPossessive(strings[i]);
             try {
                 stemmed[i] = porterStemmer.stem(sent);
             }
@@ -215,10 +235,23 @@ public class SentenceExtractor {
         return stemmed;
     }
+    private String cleanPossessive(String s) {
+        int end = s.length();
+        if (s.endsWith("\'")) {
+            return s.substring(0, end-1);
+        } else if (end > 2 && s.charAt(end-2) == '\'' && "sS".indexOf(s.charAt(end-1))>=0) {
+            return s.substring(0, end-2).toLowerCase();
+        }
+        else {
+            return s;
+        }
+    }
     private String[] toLc(String[] words) {
         String[] lower = new String[words.length];
         for (int i = 0; i < lower.length; i++) {
-            lower[i] = possessivePattern.matcher(words[i].toLowerCase()).replaceAll("");
+            lower[i] = cleanPossessive(words[i]).toLowerCase();
         }
         return lower;
     }

View File

@@ -8,7 +8,6 @@ import java.lang.ref.SoftReference;
 import java.util.BitSet;
 import java.util.Iterator;
 import java.util.StringJoiner;
-import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
@@ -53,27 +52,71 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
         return words.length;
     }
-    private final static Pattern trailingJunkPattern = Pattern.compile("(^[\"'_*]+|[_*'\"]+$)");
-    private final static Pattern joinerPattern = Pattern.compile("[-+.]+");
+    private String removeJunk(String s) {
+        int start = 0;
+        int end = s.length();
+        for (; start < end; start++) {
+            if ("\"'_*".indexOf(s.charAt(start)) < 0)
+                break;
+        }
+        for (; end > start; end--) {
+            if ("\"'_*".indexOf(s.charAt(end-1)) < 0)
+                break;
+        }
+        if (start > 0 || end < s.length()) {
+            return s.substring(start, end);
+        }
+        else {
+            return s;
+        }
+    }
     public String constructWordFromSpan(WordSpan span) {
+        if (span.size() == 1) {
+            return removeJunk(wordsLowerCase[span.start]);
+        }
+        else {
             StringJoiner sj = new StringJoiner("_");
             for (int i = span.start; i < span.end; i++) {
                 sj.add(wordsLowerCase[i]);
             }
-        return trailingJunkPattern.matcher(sj.toString()).replaceAll("");
+            return removeJunk(sj.toString());
+        }
     }
+    private String normalizeJoiner(String s) {
+        if (s.indexOf('+') >= 0) {
+            s = s.replace('+', '_');
+        }
+        if (s.indexOf('.') >= 0) {
+            s = s.replace('.', '_');
+        }
+        if (s.indexOf('-') >= 0) {
+            s = s.replace('-', '_');
+        }
+        return s;
+    }
     public String constructStemmedWordFromSpan(WordSpan span) {
+        if (span.size() > 1) {
             StringJoiner sj = new StringJoiner("_");
             for (int i = span.start; i < span.end; i++) {
                 if (includeInStemming(i))
-                    sj.add(joinerPattern.matcher(stemmedWords[i]).replaceAll("_"));
+                    sj.add(normalizeJoiner(stemmedWords[i]));
             }
             return sj.toString();
         }
+        else if (includeInStemming(span.start)) {
+            return normalizeJoiner(stemmedWords[span.start]);
+        }
+        else return "";
+    }
     private boolean includeInStemming(int i) {
         if (posTags[i].equals("IN") || posTags[i].equals("TO") || posTags[i].equals("CC") || posTags[i].equals("DT")) {

View File

@@ -5,16 +5,21 @@ import lombok.EqualsAndHashCode;
 import lombok.Getter;
 import org.jetbrains.annotations.NotNull;
+import java.util.Objects;
 @AllArgsConstructor @EqualsAndHashCode @Getter
 public class WordRep implements Comparable<WordRep> {
     public WordRep(DocumentSentence sent, WordSpan span) {
         word = sent.constructWordFromSpan(span);
         stemmed = sent.constructStemmedWordFromSpan(span);
         length = span.end - span.start;
+        hashCode = Objects.hash(word);
     }
     public final int length;
     public final String word;
     public final String stemmed;
+    private final int hashCode;
     @Override
     public int compareTo(@NotNull WordRep o) {
@@ -25,4 +30,8 @@ public class WordRep implements Comparable<WordRep> {
     public String toString() {
         return word;
     }
+    public int hashCode() {
+        return hashCode;
+    }
 }

View File

@@ -112,7 +112,8 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
                         rsp.getInt(11), // dataHash
                         EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
                         Integer.MAX_VALUE, // rankingId
-                        Double.MAX_VALUE // termScore
+                        Double.MAX_VALUE, // termScore
+                        1 // resultsFromSameDomain
                 );
                 if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
                         && Strings.isNullOrEmpty(val.description)

View File

@@ -32,6 +32,7 @@ import spark.Spark;
 import java.util.*;
 import java.util.function.LongPredicate;
+import java.util.stream.Collectors;
 import static java.util.Comparator.comparing;
 import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
@@ -184,14 +185,24 @@ public class EdgeIndexQueryService {
         }
         cachePool.clear();
-        return results.stream()
+        List<EdgeSearchResultItem> resultList = results.stream()
                 .sorted(
                         comparing(EdgeSearchResultItem::getScore)
                                 .thenComparing(EdgeSearchResultItem::getRanking)
                                 .thenComparing(EdgeSearchResultItem::getUrlIdInt)
                 )
                 .filter(domainCountFilter::test)
-                .limit(specsSet.getLimitTotal()).toList();
+                .collect(Collectors.toList());
+        if (resultList.size() > specsSet.getLimitTotal()) {
+            resultList.subList(specsSet.getLimitTotal(), resultList.size()).clear();
+        }
+        for (var result : resultList) {
+            result.resultsFromDomain = domainCountFilter.getCount(result);
+        }
+        return resultList;
     }

View File

@@ -36,15 +36,18 @@ public class ResultDomainDeduplicator {
     }
     public boolean test(EdgeSearchResultItem item) {
-        final int ranking = item.getRanking();
-        if (ranking == Integer.MAX_VALUE) {
+        final long key = item.deduplicationKey();
+        if (key == 0)
             return true;
-        }
-        // For ResultItems, consider bucketId as well as different buckets may use different
-        // ranking algorithms
-        final long key = ranking*32L + item.bucketId;
         return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
     }
+    public int getCount(EdgeSearchResultItem item) {
+        final long key = item.deduplicationKey();
+        if (key == 0)
+            return 1;
+        return resultsByRankingId.get(key);
+    }
 }

View File

@@ -16,6 +16,8 @@ public class EdgeSearchResultItem {
     public final List<EdgeSearchResultKeywordScore> scores;
+    public int resultsFromDomain;
     public EdgeSearchResultItem(int bucketId, long val) {
         this.bucketId = bucketId;
         this.combinedId = val;
@@ -32,6 +34,7 @@ public class EdgeSearchResultItem {
     public int getRanking() {
         return (int)(combinedId >>> 32);
     }
+    public int getResultsFromDomain() { return resultsFromDomain; }
     /* Used for evaluation */
     private transient double scoreValue = 1;
@@ -56,4 +59,14 @@ public class EdgeSearchResultItem {
         }
         return false;
     }
+    public long deduplicationKey() {
+        final int ranking = getRanking();
+        if (ranking == Integer.MAX_VALUE) {
+            return 0;
+        }
+        return ranking*32L + bucketId;
+    }
 }

View File

@@ -30,6 +30,12 @@ public class EdgeUrlDetails {
     public long rankingId;
     public double termScore;
+    public int resultsFromSameDomain;
+    public boolean hasMoreResults() {
+        return resultsFromSameDomain > 1;
+    }
     public long rankingIdAdjustment() {
         int penalty = 0;

View File

@@ -67,7 +67,12 @@ public class SiteListCommand implements SearchCommandInterface {
             resultSet = Collections.emptyList();
         }
-        return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, "focusDomain", Objects.requireNonNullElse(domain, ""), "profile", parameters.profileStr(), "results", resultSet, "screenshot", screenshotPath == null ? "" : screenshotPath.toString())));
+        return Optional.of(siteInfoRenderer.render(results, Map.of("query", query,
+                "hideRanking", true,
+                "focusDomain", Objects.requireNonNullElse(domain, ""),
+                "profile", parameters.profileStr(),
+                "results", resultSet, "screenshot",
+                screenshotPath == null ? "" : screenshotPath.toString())));
     }

View File

@@ -59,6 +59,7 @@ public class SearchResultDecorator {
             details.rankingId = rankingId;
         }
+        details.resultsFromSameDomain = resultItem.resultsFromDomain;
         details.termScore = calculateTermScore(resultItem, details);
         logger.debug("{} -> {}", details.url, details.termScore);

View File

@@ -1,12 +0,0 @@
-<section class="card browse-result rightbox">
-    <h2>{{url.domain}}</h2>
-    <a href="{{url.proto}}://{{url.domain}}/">
-        <img src="/screenshot/{{domainId}}" title="{{description}}" loading="lazy"/>
-    </a>
-    <div class="utils">
-        <a href="/site/{{url.domain}}">Info</a>
-        <a href="/explore/{{url.domain}}">Similar Domains</a>
-    </div>
-</section>

View File

@@ -5,6 +5,6 @@
 {{#if cookies}}<abbr title="cookies" class="meta">👁️️</abbr>{{/if}}
 {{#if ads}}<abbr title="possible annoying ads or popovers (experimental)" class="meta">⚠️️️</abbr>{{/if}}
 <span class="meta">{{format}}</span>
-{{#unless focusDomain}}
+{{#unless hideRanking}}
 <span class="rank-symbol" title="{{rankingSymbolDesc}}">{{{rankingSymbol}}}</span>
 {{/unless}}

View File

@@ -6,8 +6,7 @@
     <div class="utils">
         <a href="/site/{{url.domain}}" title="Domain Information">Info</a>
-        {{#unless focusDomain}}<a href="/site-search/{{url.domain}}/{{query}}?profile={{profile}}" title="Narrow the search to this domain">Search</a>{{/unless}}
+        {{#unless focusDomain}}{{#if hasMoreResults}}<a href="/site-search/{{url.domain}}/{{query}}?profile={{profile}}" title="More results from this domain">{{resultsFromSameDomain}}+</a>{{/if}}{{/unless}}
         <div class="meta">{{>edge/search-result-metadata}}</div>
     </div>
     <hr class="w3m-helper" />

View File

@@ -19,7 +19,6 @@
 <article>
     {{>edge/parts/search-form}}
-    {{#each domainResults}}{{>edge/browse-result-rb}}{{/each}}
     <section class="cards">
         {{#if maintenanceMessage}}<section class="card problems onlyscreen"><h2>Maintenance</h2><p class="description">{{maintenanceMessage}}</p></section>{{/if}}
@@ -41,6 +40,7 @@
         {{#unless evalResult}}{{#if problems}}<section class="card problems onlyscreen"><h2>Suggestions</h2><ul class="onlyscreen search-problems">{{#each problems}}<li>{{{.}}}</li>{{/each}}</ul></section> {{/if}}{{/unless}}
+        {{#each domainResults}}{{>edge/browse-result}}{{/each}}
         {{#each results}}{{>edge/search-result}}{{/each}}
     </section>

View File

@@ -38,6 +38,31 @@ class SentenceExtractorTest {
         legacySe.setLegacyMode(true);
     }
+    public static void main(String... args) throws IOException {
+        final LanguageModels lm = TestLanguageModels.getLanguageModels();
+        var data = Path.of("/home/vlofgren/Code/tmp-data/");
+        System.out.println("Running");
+        SentenceExtractor se = new SentenceExtractor(lm);
+        var dict = new TermFrequencyDict(lm);
+        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
+        for (;;) {
+            long total = 0;
+            for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
+                var doc = Jsoup.parse(Files.readString(file.toPath()));
+                long start = System.currentTimeMillis();
+                var dld = se.extractSentences(doc);
+                documentKeywordExtractor.extractKeywords(dld);
+                total += (System.currentTimeMillis() - start);
+            }
+            System.out.println(total);
+        }
+    }
     @SneakyThrows
     @Test
     void testExtractSubject() {

View File

@@ -1,8 +1,6 @@
 package com.github.datquocnguyen;
 import java.util.HashMap;
-import java.util.function.Predicate;
-import java.util.regex.Pattern;
 /** GPLv3
  * @author DatQuocNguyen
@@ -10,18 +8,106 @@ import java.util.regex.Pattern;
  */
 public class InitialTagger
 {
-    private static final Pattern QUOTATION = Pattern.compile("(“)|(”)|(\")");
-    private static final Predicate<String> CD = Pattern.compile("[0-9]+").asPredicate();
-    private static final Predicate<String> URL = Pattern.compile("[A-Za-z]\\w*(\\.[A-Za-z]\\w+)+").asPredicate();
-    private static final Predicate<String> JJ1 = Pattern.compile("([0-9]+-)|(-[0-9]+)").asPredicate();
-    private static final Predicate<String> JJ2 = Pattern.compile("(^[Ii]nter.*)|(^[nN]on.*)|(^[Dd]is.*)|(^[Aa]nti.*)").asPredicate();
-    private static final Predicate<String> JJ3 = Pattern.compile("(.*ful$)|(.*ous$)|(.*ble$)|(.*ic$)|(.*ive$)|(.*est$)|(.*able$)|(.*al$)").asPredicate();
-    private static final Predicate<String> NN = Pattern.compile("(.*ness$)|(.*ment$)|(.*ship$)|(^[Ee]x-.*)|(^[Ss]elf-.*)").asPredicate();
-    private static final Predicate<String> NNS = Pattern.compile(".*s$").asPredicate();
-    private static final Predicate<String> VBG = Pattern.compile(".*ing$").asPredicate();
-    private static final Predicate<String> VBN = Pattern.compile(".*ed$").asPredicate();
-    private static final Predicate<String> RB = Pattern.compile(".*ly$").asPredicate();
+    static public boolean jj1(String s) {
+        int idx = s.indexOf('-');
+        while (idx >= 0) {
+            if (idx > 0 && isDigit(s.charAt(idx-1)))
+                return true;
+            if (idx+1 < s.length() && isDigit(s.charAt(idx+1)))
+                return true;
+            idx = s.indexOf('-', idx+1);
+        }
+        return false;
+    }
+    static public boolean nn(String s) {
+        if (s.endsWith("ness"))
+            return true;
+        if (s.endsWith("ment"))
+            return true;
+        if (s.endsWith("ship"))
+            return true;
+        if (s.startsWith("Ex"))
+            return true;
+        if (s.startsWith("ex"))
+            return true;
+        if (s.startsWith("Self-"))
+            return true;
+        if (s.startsWith("self-"))
+            return true;
+        return false;
+    }
+    static public boolean jj2(String s) {
+        if (s.startsWith("Inter"))
+            return true;
+        if (s.startsWith("inter"))
+            return true;
+        if (s.startsWith("Dis"))
+            return true;
+        if (s.startsWith("dis"))
+            return true;
+        if (s.startsWith("Anti"))
+            return true;
+        if (s.startsWith("anti"))
+            return true;
+        return false;
+    }
+    static public boolean jj3(String s) {
+        if (s.contains("-"))
+            return true;
+        if (s.endsWith("ful"))
+            return true;
+        if (s.endsWith("ous"))
+            return true;
+        if (s.endsWith("ble"))
+            return true;
+        if (s.endsWith("ic"))
+            return true;
+        if (s.endsWith("ive"))
+            return true;
+        if (s.endsWith("est"))
+            return true;
+        if (s.endsWith("able"))
+            return true;
+        if (s.endsWith("al"))
+            return true;
+        return false;
+    }
+    static public boolean url(String s) {
+        int pointIdx = s.indexOf('.');
+        return pointIdx >= 0 && pointIdx != s.length()-1;
+    }
+    static public boolean cd(String s) {
+        for (int i = 0; i < s.length(); i++) {
+            char c = s.charAt(i);
+            if (isDigit(c)) {
+                return true;
+            }
+        }
+        return false;
+    }
+    public static boolean isDigit(char c) {
+        return c >= '0' && c <= '9';
+    }
+    static public boolean rb(String s) {
+        return s.endsWith("ly");
+    }
+    static public boolean vbn(String s) {
+        return s.endsWith("vbn");
+    }
+    static public boolean vbg(String s) {
+        return s.endsWith("vbg");
+    }
+    static public boolean nns(String s) {
+        return Character.isLowerCase(s.charAt(0)) && s.endsWith("s");
+    }
     public static String[] EnInitTagger4Sentence(
             HashMap<String, String> DICT, String[] sentence)
@@ -35,9 +121,9 @@ public class InitialTagger
     }
     private static String getTagForWordEn(HashMap<String, String> DICT, String word) {
-        if (QUOTATION.matcher(word).find()) {
+        if (word.contains("\"") || word.contains("“") || word.contains("”"))
             return DICT.get("''");
-        }
         if ("[]()<>!".contains(word)) {
             return "?";
         }
@@ -47,28 +133,27 @@ public class InitialTagger
         String lowerW = word.toLowerCase();
         if (DICT.containsKey(lowerW))
             return DICT.get(lowerW);
-        if (JJ1.test(word))
+        if (jj1(word))
            return "JJ";
-        if (URL.test(word))
+        if (url(word))
            return "NN";
-        if (CD.test(word))
+        if (cd(word))
            return "CD";
-        if (NN.test(word))
+        if (nn(word))
            return "NN";
-        if (NNS.test(word)
-                && Character.isLowerCase(word.charAt(0)))
+        if (nns(word))
            return "NNS";
        if (Character.isUpperCase(word.charAt(0)))
            return "NNP";
-        if (JJ2.test(word))
+        if (jj2(word))
            return "JJ";
-        if (VBG.test(word))
+        if (vbg(word))
            return "VBG";
-        if (VBN.test(word))
+        if (vbn(word))
            return "VBN";
-        if (word.contains("-") || JJ3.test(word))
+        if (jj3(word))
            return "JJ";
-        if (RB.test(word))
+        if (rb(word))
            return "RB";
        return "NN";