UX improvements for "show more results".
parent 2e740bb7bd
commit 9558077808
@@ -28,11 +28,11 @@ import java.nio.file.Path;
import java.time.Duration;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;

@Tag("e2e")
@Testcontainers
@@ -188,7 +188,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
        System.out.println(driver.getTitle());

        var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
        assertEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
        assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));

        Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query"));
    }
@@ -201,7 +201,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
        System.out.println(driver.getTitle());

        var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
        assertNotEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
        assertEquals(Collections.emptyList(), getTitlesFromSearchResults(html));

        Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-yes-js"));
    }
@@ -214,7 +214,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
        System.out.println(driver.getTitle());

        var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
        assertEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
        assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));

        Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-no-js"));
    }
@@ -31,7 +31,7 @@ public class WordPatterns {

    public static final Set<String> topWords;
    static {
        topWords = new HashSet<>(200);
        topWords = new HashSet<>(200, 0.25f);
        try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-stopwords"),
                "Could not load word frequency table");
             var br = new BufferedReader(new InputStreamReader(resource))
@@ -87,11 +87,33 @@ public class WordPatterns {
        return true;
    }

    public static boolean hasWordQualities(String s) {
        int start = 0;
        int end = s.length();
        if (s.charAt(0) == '#') start++;
        if (end > 1 && s.charAt(end-1) == '#') end--;

        for (int i = start; i < end; i++) {
            char c = s.charAt(i);
            if (!("_@.'+-".indexOf(c) >= 0)
                    && !(c >= 'a' && c <= 'z')
                    && !(c >= 'A' && c <= 'Z')
                    && !(c >= '0' && c <= '9')
                    && !(c >= '\u00C0' && c <= '\u00D6')
                    && !(c >= '\u00D8' && c <= '\u00f6')
                    && !(c >= '\u00f8' && c <= '\u00ff')) {
                return false;
            }
        }

        return true;
    }

    public static boolean isStopWord(String s) {
        if (s.length() < MIN_WORD_LENGTH) {
            return true;
        }
        if (!wordQualitiesPredicate.test(s)) {
        if (!hasWordQualities(s)) {
            return true;
        }
        if (!filter(s)) {
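The hasWordQualities rewrite above swaps a compiled-regex predicate for an explicit character scan, and the 0.25f load factor keeps the stopword table sparse so lookups rarely walk collision chains. A minimal sketch of the scan-instead-of-regex idea, using an illustrative whitelist rather than the project's exact wordQualitiesPredicate:

```java
import java.util.function.Predicate;
import java.util.regex.Pattern;

class CharScanVsRegex {
    // Illustrative whitelist, not the project's exact pattern.
    static final Predicate<String> REGEX =
            Pattern.compile("[a-zA-Z0-9_@.'+-]+").asMatchPredicate();

    // Hand-rolled equivalent: one pass over the chars, no Matcher allocation per call.
    static boolean scan(String s) {
        if (s.isEmpty()) return false;
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            boolean ok = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
                    || (c >= '0' && c <= '9') || "_@.'+-".indexOf(c) >= 0;
            if (!ok) return false;
        }
        return true;
    }

    public static void main(String[] args) {
        System.out.println(REGEX.test("foo'bar") + " " + scan("foo'bar")); // true true
        System.out.println(REGEX.test("foo bar") + " " + scan("foo bar")); // false false
    }
}
```

The scan does the same per-character work, but a Predicate built from a Pattern allocates a fresh Matcher on every test() call, which matters in a method that runs once per extracted word.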
@@ -1,16 +1,23 @@
package nu.marginalia.util.language.processing;

import java.util.function.Predicate;
import java.util.regex.Pattern;

public class AsciiFlattener {

    private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:\\-]+");
    private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$");
    private static final Predicate<String> plainAscii = plainAsciiPattern.asMatchPredicate();

    private static boolean isPlainAscii(String s) {
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if ((c & 0x80) != 0) {
                return false;
            }
        }
        return true;
    }

    public static String flattenUnicode(String s) {
        if (plainAscii.test(s)) {
        if (isPlainAscii(s)) {
            return s;
        }
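The fast path above leans on ASCII occupying code points 0 through 127. One caveat worth noting: masking with 0x80 inspects only bit 7 of the UTF-16 code unit, so characters at U+0100 and above whose low byte has that bit clear (U+0100 itself, for instance) would slip through; comparing against 0x7F covers the whole range. A small sketch of the robust variant:

```java
public class AsciiCheck {
    // Robust variant: any UTF-16 code unit above 0x7F is non-ASCII.
    static boolean isPlainAscii(CharSequence s) {
        for (int i = 0; i < s.length(); i++) {
            if (s.charAt(i) > 0x7F) return false;
        }
        return true;
    }

    public static void main(String[] args) {
        System.out.println(isPlainAscii("hello"));   // true
        System.out.println(isPlainAscii("héllo"));   // false: é is U+00E9
        System.out.println(isPlainAscii("\u0100"));  // false; (c & 0x80) != 0 would miss this one
    }
}
```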
@@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;

import javax.inject.Inject;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class DocumentKeywordExtractor {
@@ -156,13 +157,16 @@ public class DocumentKeywordExtractor {
        }
    }

    private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
    private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
        Set<String> reps = new HashSet<>();

        for (var sent : documentLanguageData.sentences) {
            for (var word : sent) {
                String lc = word.wordLowerCase();
                if (lc.matches("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+")) {
                if (lc.length() > 6
                        && lc.indexOf('@') > 0
                        && mailLikePattern.matcher(lc).matches()) {
                    reps.add(lc);

                    String domain = lc.substring(lc.indexOf('@'));
@@ -189,6 +193,6 @@ public class DocumentKeywordExtractor {
    }

    public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
        return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
        return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns::hasWordQualities).collect(Collectors.toSet()));
    }
}
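Two things changed in getArtifacts: String.matches, which compiles its pattern on every call, became a static precompiled Pattern, and two cheap guards now run first so most tokens never reach the regex engine at all. A sketch of that guard-then-match shape (the class and method names are illustrative):

```java
import java.util.regex.Pattern;

class MailArtifactSketch {
    private static final Pattern MAIL_LIKE =
            Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");

    // Cheap String checks filter out the common case before the
    // comparatively expensive regex runs.
    static boolean looksLikeMail(String lc) {
        return lc.length() > 6            // shortest plausible address
                && lc.indexOf('@') > 0    // '@' present, and not in first position
                && MAIL_LIKE.matcher(lc).matches();
    }

    public static void main(String[] args) {
        System.out.println(looksLikeMail("user@example.com")); // true
        System.out.println(looksLikeMail("banana"));           // false, the regex never runs
    }
}
```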
@@ -23,8 +23,8 @@ public class KeywordCounter {
    }

    public WordHistogram countHisto(DocumentLanguageData dld) {
        HashMap<String, Integer> counts = new HashMap<>(1000);
        HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
        HashMap<String, Integer> counts = new HashMap<>(15000);
        HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000);

        for (var sent : dld.sentences) {
@@ -37,15 +37,15 @@ public class KeywordCounter {
                String stemmed = sent.constructStemmedWordFromSpan(span);

                counts.merge(stemmed, 1, Integer::sum);
                instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
                instances.computeIfAbsent(stemmed, k -> new HashSet<>(500)).add(new WordRep(sent, span));
            }
        }

        double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1);

        Set<WordRep> h5 = new HashSet<>();
        Set<WordRep> h10 = new HashSet<>();
        Set<WordRep> h15 = new HashSet<>();
        Set<WordRep> h5 = new HashSet<>(2500);
        Set<WordRep> h10 = new HashSet<>(500);
        Set<WordRep> h15 = new HashSet<>(500);

        int doubleWordCount = 0;

@@ -65,13 +65,14 @@ public class KeywordCounter {

            histogram.addAll(instances.get(wordStemmed));
        }

        return new WordHistogram(h5, h10, h15);
    }

    private static final Pattern separator = Pattern.compile("_");

    public double getTermValue(Map.Entry<String, Integer> e, double maxValue) {
        String key = e.getKey();
        if (key.contains("_")) {
            String[] parts = separator.split(e.getKey());
            double totalValue = 0.;
            for (String part : parts) {
@@ -79,6 +80,10 @@ public class KeywordCounter {
            }
            return totalValue / parts.length;
        }
        else {
            return value(key, e.getValue(), maxValue);
        }
    }

    double value(String key, double value, double maxValue) {
        double freq = dict.getTermFreqStemmed(key);
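The capacity bumps here are about HashMap's growth rule: the table doubles and rehashes every entry whenever size exceeds capacity times load factor (0.75 by default), so sizing for the expected ~15,000 distinct stems up front skips roughly a dozen rehash cycles per document. A sketch of the effect:

```java
import java.util.HashMap;

class PresizeSketch {
    public static void main(String[] args) {
        // Default capacity 16, load factor 0.75: filling to 15_000 entries
        // doubles the table ~11 times along the way.
        var grown = new HashMap<String, Integer>();

        // new HashMap<>(15_000) rounds up to a 16_384-slot table; only the
        // final entries push it past the 12_288 threshold, so one resize.
        var presized = new HashMap<String, Integer>(15_000);

        for (int i = 0; i < 15_000; i++) {
            grown.put("key" + i, i);
            presized.put("key" + i, i);
        }
        System.out.println(grown.size() + " " + presized.size()); // 15000 15000
    }
}
```

The same reasoning applies to the 0.25f load factor in the WordPatterns change: more memory for the table in exchange for shorter probe sequences on a set that is queried once per word.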
@@ -6,10 +6,10 @@ import gnu.trove.map.hash.TObjectIntHashMap;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.SneakyThrows;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import nu.marginalia.util.language.conf.LanguageModels;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;
@@ -127,8 +127,9 @@ public class SentenceExtractor {

    private static final Pattern dotPattern = Pattern.compile("\\.+$");
    private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
    private static final Pattern spacesPattern = Pattern.compile("\\s+");

    private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
    private static final Pattern possessivePattern = Pattern.compile("'(s)?$");

    public DocumentSentence extractSentence(String text) {
        var wordsAndSeps = splitSegment(text);
@@ -142,10 +143,20 @@ public class SentenceExtractor {
        );
    }

    public String normalizeSpaces(String s) {
        if (s.indexOf('\t') >= 0) {
            s = s.replace('\t', ' ');
        }
        if (s.indexOf('\n') >= 0) {
            s = s.replace('\n', ' ');
        }
        return s;
    }

    public DocumentSentence[] extractSentencesFromString(String text) {
        String[] sentences;

        String textNormalizedSpaces = text.replaceAll("\\s", " ");
        String textNormalizedSpaces = normalizeSpaces(text);
        try {
            sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
        }
@@ -157,10 +168,17 @@ public class SentenceExtractor {
            sentences = Arrays.copyOf(sentences, 250);
        }

        sentences = Arrays.stream(sentences)
                .filter(s -> !s.isBlank())
                .flatMap(s -> Arrays.stream(splitPattern.split(s)))
                .toArray(String[]::new);
        List<String> sentenceList = new ArrayList<>();
        for (var s : sentences) {
            if (s.isBlank()) continue;
            if (s.contains("-") || s.contains("|")) {
                sentenceList.addAll(Arrays.asList(splitPattern.split(s)));
            }
            else {
                sentenceList.add(s);
            }
        }
        sentences = sentenceList.toArray(String[]::new);

        final String[][] tokens = new String[sentences.length][];
        final int[][] separators = new int[sentences.length][];
@@ -178,9 +196,11 @@ public class SentenceExtractor {
                separators[i] = Arrays.copyOf(separators[i], 250);
            }
            for (int j = 0; j < tokens[i].length; j++) {
                if (tokens[i][j].endsWith(".")) {
                    tokens[i][j] = dotPattern.matcher(tokens[i][j]).replaceAll("");
                }
            }
        }

        for (int i = 0; i < tokens.length; i++) {
            posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
@@ -204,7 +224,7 @@ public class SentenceExtractor {
    private String[] stemSentence(String[] strings) {
        String[] stemmed = new String[strings.length];
        for (int i = 0; i < stemmed.length; i++) {
            var sent = possessivePattern.matcher(strings[i]).replaceAll("");
            var sent = cleanPossessive(strings[i]);
            try {
                stemmed[i] = porterStemmer.stem(sent);
            }
@@ -215,10 +235,23 @@ public class SentenceExtractor {
        return stemmed;
    }

    private String cleanPossessive(String s) {
        int end = s.length();

        if (s.endsWith("\'")) {
            return s.substring(0, end-1);
        } else if (end > 2 && s.charAt(end-2) == '\'' && "sS".indexOf(s.charAt(end-1))>=0) {
            return s.substring(0, end-2).toLowerCase();
        }
        else {
            return s;
        }
    }

    private String[] toLc(String[] words) {
        String[] lower = new String[words.length];
        for (int i = 0; i < lower.length; i++) {
            lower[i] = possessivePattern.matcher(words[i].toLowerCase()).replaceAll("");
            lower[i] = cleanPossessive(words[i]).toLowerCase();
        }
        return lower;
    }
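Two hot-path rewrites above follow the same logic: normalizeSpaces only pays for a replacement when indexOf finds something to replace (and, unlike replaceAll("\\s", " "), deliberately touches only tabs and newlines), and the sentence-splitting stream became a plain loop that only invokes splitPattern when a '-' or '|' is actually present. A sketch of the check-before-replace idea:

```java
class NormalizeSpacesSketch {
    // Narrower contract than replaceAll("\\s", " "): \r and \f pass through untouched.
    static String normalizeSpaces(String s) {
        if (s.indexOf('\t') >= 0) s = s.replace('\t', ' ');
        if (s.indexOf('\n') >= 0) s = s.replace('\n', ' ');
        return s;
    }

    public static void main(String[] args) {
        System.out.println(normalizeSpaces("one\ttwo\nthree"));      // "one two three"
        System.out.println(normalizeSpaces("clean") == "clean");     // true: no allocation when nothing matches
    }
}
```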
@@ -8,7 +8,6 @@ import java.lang.ref.SoftReference;
import java.util.BitSet;
import java.util.Iterator;
import java.util.StringJoiner;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

@@ -53,27 +52,71 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
        return words.length;
    }

    private final static Pattern trailingJunkPattern = Pattern.compile("(^[\"'_*]+|[_*'\"]+$)");
    private final static Pattern joinerPattern = Pattern.compile("[-+.]+");
    private String removeJunk(String s) {
        int start = 0;
        int end = s.length();

        for (; start < end; start++) {
            if ("\"'_*".indexOf(s.charAt(start)) < 0)
                break;
        }

        for (; end > start; end--) {
            if ("\"'_*".indexOf(s.charAt(end-1)) < 0)
                break;
        }

        if (start > 0 || end < s.length()) {
            return s.substring(start, end);
        }
        else {
            return s;
        }
    }

    public String constructWordFromSpan(WordSpan span) {
        if (span.size() == 1) {
            return removeJunk(wordsLowerCase[span.start]);
        }
        else {
            StringJoiner sj = new StringJoiner("_");
            for (int i = span.start; i < span.end; i++) {
                sj.add(wordsLowerCase[i]);
            }

            return trailingJunkPattern.matcher(sj.toString()).replaceAll("");
            return removeJunk(sj.toString());
        }
    }

    private String normalizeJoiner(String s) {

        if (s.indexOf('+') >= 0) {
            s = s.replace('+', '_');
        }
        if (s.indexOf('.') >= 0) {
            s = s.replace('.', '_');
        }
        if (s.indexOf('-') >= 0) {
            s = s.replace('-', '_');
        }
        return s;
    }

    public String constructStemmedWordFromSpan(WordSpan span) {
        if (span.size() > 1) {

            StringJoiner sj = new StringJoiner("_");
            for (int i = span.start; i < span.end; i++) {
                if (includeInStemming(i))
                    sj.add(joinerPattern.matcher(stemmedWords[i]).replaceAll("_"));
                    sj.add(normalizeJoiner(stemmedWords[i]));

            }
            return sj.toString();
        }
        else if (includeInStemming(span.start)) {
            return normalizeJoiner(stemmedWords[span.start]);
        }
        else return "";
    }

    private boolean includeInStemming(int i) {
        if (posTags[i].equals("IN") || posTags[i].equals("TO") || posTags[i].equals("CC") || posTags[i].equals("DT")) {
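removeJunk above trims a fixed set of quote and junk characters from both ends with two index scans and at most one substring, where the old trailingJunkPattern.replaceAll allocated a Matcher and a fresh string on every call. The same shape, generalized, as a sketch:

```java
class TrimJunk {
    // Trim any chars from `junk` off both ends; return the original
    // instance when nothing needed trimming.
    static String trim(String s, String junk) {
        int start = 0, end = s.length();
        while (start < end && junk.indexOf(s.charAt(start)) >= 0) start++;
        while (end > start && junk.indexOf(s.charAt(end - 1)) >= 0) end--;
        return (start > 0 || end < s.length()) ? s.substring(start, end) : s;
    }

    public static void main(String[] args) {
        System.out.println(trim("\"'hello_world'\"", "\"'_*")); // hello_world
        System.out.println(trim("plain", "\"'_*"));             // plain
    }
}
```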
@@ -5,16 +5,21 @@ import lombok.EqualsAndHashCode;
import lombok.Getter;
import org.jetbrains.annotations.NotNull;

import java.util.Objects;

@AllArgsConstructor @EqualsAndHashCode @Getter
public class WordRep implements Comparable<WordRep> {

    public WordRep(DocumentSentence sent, WordSpan span) {
        word = sent.constructWordFromSpan(span);
        stemmed = sent.constructStemmedWordFromSpan(span);
        length = span.end - span.start;
        hashCode = Objects.hash(word);
    }
    public final int length;
    public final String word;
    public final String stemmed;
    private final int hashCode;

    @Override
    public int compareTo(@NotNull WordRep o) {
@@ -25,4 +30,8 @@ public class WordRep implements Comparable<WordRep> {
    public String toString() {
        return word;
    }

    public int hashCode() {
        return hashCode;
    }
}
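WordRep objects are poured into HashSets inside the keyword-counting loop, so hashCode() is called constantly; computing Objects.hash(word) once in the constructor turns every later call into a field read, which is sound only because the hashed fields are final. The pattern in isolation:

```java
import java.util.Objects;

// Hash caching for an immutable value type that lives in hash sets.
final class CachedHashKey {
    final String word;
    private final int hashCode;

    CachedHashKey(String word) {
        this.word = word;
        this.hashCode = Objects.hash(word);  // computed exactly once
    }

    @Override public int hashCode() { return hashCode; }

    @Override public boolean equals(Object o) {
        return o instanceof CachedHashKey other && word.equals(other.word);
    }

    public static void main(String[] args) {
        var set = new java.util.HashSet<CachedHashKey>();
        set.add(new CachedHashKey("bird"));
        System.out.println(set.contains(new CachedHashKey("bird"))); // true
    }
}
```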
@@ -112,7 +112,8 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
        rsp.getInt(11), // dataHash
        EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
        Integer.MAX_VALUE, // rankingId
        Double.MAX_VALUE // termScore
        Double.MAX_VALUE, // termScore
        1 // resultsFromSameDomain
        );
        if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
                && Strings.isNullOrEmpty(val.description)
@@ -32,6 +32,7 @@ import spark.Spark;

import java.util.*;
import java.util.function.LongPredicate;
import java.util.stream.Collectors;

import static java.util.Comparator.comparing;
import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
@@ -184,14 +185,24 @@ public class EdgeIndexQueryService {
        }
        cachePool.clear();

        return results.stream()
        List<EdgeSearchResultItem> resultList = results.stream()
                .sorted(
                        comparing(EdgeSearchResultItem::getScore)
                                .thenComparing(EdgeSearchResultItem::getRanking)
                                .thenComparing(EdgeSearchResultItem::getUrlIdInt)
                )
                .filter(domainCountFilter::test)
                .limit(specsSet.getLimitTotal()).toList();
                .collect(Collectors.toList());

        if (resultList.size() > specsSet.getLimitTotal()) {
            resultList.subList(specsSet.getLimitTotal(), resultList.size()).clear();
        }

        for (var result : resultList) {
            result.resultsFromDomain = domainCountFilter.getCount(result);
        }

        return resultList;
    }
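The stream no longer truncates with limit(): every hit now flows through the deduplicator first so the per-domain counts are complete, and only then is the tail dropped. subList(...).clear() removes that tail in place through a view of the backing list, which also explains the switch from Stream.toList() (which returns an unmodifiable list) to Collectors.toList(). The truncation idiom in isolation:

```java
import java.util.ArrayList;
import java.util.List;

class TruncateSketch {
    // subList is a live view, so clearing it removes the tail from the
    // underlying list without copying the head.
    static <T> void truncate(List<T> list, int limit) {
        if (list.size() > limit) {
            list.subList(limit, list.size()).clear();
        }
    }

    public static void main(String[] args) {
        List<Integer> xs = new ArrayList<>(List.of(1, 2, 3, 4, 5));
        truncate(xs, 3);
        System.out.println(xs); // [1, 2, 3]
    }
}
```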
@@ -36,15 +36,18 @@ public class ResultDomainDeduplicator {
    }

    public boolean test(EdgeSearchResultItem item) {
        final int ranking = item.getRanking();
        if (ranking == Integer.MAX_VALUE) {
        final long key = item.deduplicationKey();
        if (key == 0)
            return true;
        }

        // For ResultItems, consider bucketId as well as different buckets may use different
        // ranking algorithms
        final long key = ranking*32L + item.bucketId;

        return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
    }

    public int getCount(EdgeSearchResultItem item) {
        final long key = item.deduplicationKey();
        if (key == 0)
            return 1;

        return resultsByRankingId.get(key);
    }
}
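adjustOrPutValue(key, 1, 1) is the Trove primitive-map idiom for insert-or-increment in a single probe, so test() both counts the hit and reports whether the domain is still under its cap. A JDK-only sketch of the same count-and-cap logic, with HashMap.merge standing in for Trove (the field's declaration isn't part of this hunk, so the map type here is an assumption for illustration):

```java
import java.util.HashMap;
import java.util.Map;

class DomainCapSketch {
    private final Map<Long, Integer> countsByKey = new HashMap<>();
    private final int limitByDomain = 2;

    boolean test(long key) {
        if (key == 0) return true;  // sentinel: never deduplicated
        // merge inserts 1 on first sight, otherwise increments, and
        // returns the new count.
        return countsByKey.merge(key, 1, Integer::sum) <= limitByDomain;
    }

    public static void main(String[] args) {
        var dedup = new DomainCapSketch();
        System.out.println(dedup.test(42)); // true  (1st hit)
        System.out.println(dedup.test(42)); // true  (2nd hit)
        System.out.println(dedup.test(42)); // false (over the cap of 2)
    }
}
```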
@@ -16,6 +16,8 @@ public class EdgeSearchResultItem {

    public final List<EdgeSearchResultKeywordScore> scores;

    public int resultsFromDomain;

    public EdgeSearchResultItem(int bucketId, long val) {
        this.bucketId = bucketId;
        this.combinedId = val;
@@ -32,6 +34,7 @@ public class EdgeSearchResultItem {
    public int getRanking() {
        return (int)(combinedId >>> 32);
    }
    public int getResultsFromDomain() { return resultsFromDomain; }

    /* Used for evaluation */
    private transient double scoreValue = 1;
@@ -56,4 +59,14 @@ public class EdgeSearchResultItem {
        }
        return false;
    }

    public long deduplicationKey() {
        final int ranking = getRanking();

        if (ranking == Integer.MAX_VALUE) {
            return 0;
        }

        return ranking*32L + bucketId;
    }
}
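deduplicationKey() folds the ranking id and bucket id into one long so the deduplicator can key a primitive map on the pair, with 0 reserved as a "not deduplicated" sentinel. The ranking*32L packing only keeps pairs distinct while bucketId stays below 32, which presumably holds for the small number of index buckets; a shift-based pack avoids that constraint entirely. A defensive sketch, not the commit's code:

```java
class KeyPackSketch {
    // Shift by a full 32 bits so the two fields can never collide,
    // regardless of how large bucketId grows.
    static long pack(int ranking, int bucketId) {
        return ((long) ranking << 32) | (bucketId & 0xFFFFFFFFL);
    }

    public static void main(String[] args) {
        long a = pack(7, 1);
        long b = pack(7, 2);
        System.out.println(a != b);           // true: distinct buckets, distinct keys
        System.out.println((int) (a >>> 32)); // 7: ranking recovered
    }
}
```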
@@ -30,6 +30,12 @@ public class EdgeUrlDetails {
    public long rankingId;
    public double termScore;

    public int resultsFromSameDomain;

    public boolean hasMoreResults() {
        return resultsFromSameDomain > 1;
    }

    public long rankingIdAdjustment() {
        int penalty = 0;
@@ -67,7 +67,12 @@ public class SiteListCommand implements SearchCommandInterface {
            resultSet = Collections.emptyList();
        }

        return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, "focusDomain", Objects.requireNonNullElse(domain, ""), "profile", parameters.profileStr(), "results", resultSet, "screenshot", screenshotPath == null ? "" : screenshotPath.toString())));
        return Optional.of(siteInfoRenderer.render(results, Map.of("query", query,
                "hideRanking", true,
                "focusDomain", Objects.requireNonNullElse(domain, ""),
                "profile", parameters.profileStr(),
                "results", resultSet, "screenshot",
                screenshotPath == null ? "" : screenshotPath.toString())));
    }
@@ -59,6 +59,7 @@ public class SearchResultDecorator {
            details.rankingId = rankingId;
        }

        details.resultsFromSameDomain = resultItem.resultsFromDomain;
        details.termScore = calculateTermScore(resultItem, details);

        logger.debug("{} -> {}", details.url, details.termScore);
@@ -1,12 +0,0 @@
<section class="card browse-result rightbox">
    <h2>{{url.domain}}</h2>

    <a href="{{url.proto}}://{{url.domain}}/">
        <img src="/screenshot/{{domainId}}" title="{{description}}" loading="lazy"/>
    </a>

    <div class="utils">
        <a href="/site/{{url.domain}}">Info</a>
        <a href="/explore/{{url.domain}}">Similar Domains</a>
    </div>
</section>
@@ -5,6 +5,6 @@
{{#if cookies}}<abbr title="cookies" class="meta">👁️️</abbr>{{/if}}
{{#if ads}}<abbr title="possible annoying ads or popovers (experimental)" class="meta">⚠️️️</abbr>{{/if}}
<span class="meta">{{format}}</span>
{{#unless focusDomain}}
{{#unless hideRanking}}
<span class="rank-symbol" title="{{rankingSymbolDesc}}">{{{rankingSymbol}}}</span>
{{/unless}}
@@ -6,8 +6,7 @@

<div class="utils">
    <a href="/site/{{url.domain}}" title="Domain Information">Info</a>
    {{#unless focusDomain}}<a href="/site-search/{{url.domain}}/{{query}}?profile={{profile}}" title="Narrow the search to this domain">Search</a>{{/unless}}

    {{#unless focusDomain}}{{#if hasMoreResults}}<a href="/site-search/{{url.domain}}/{{query}}?profile={{profile}}" title="More results from this domain">{{resultsFromSameDomain}}+</a>{{/if}}{{/unless}}
    <div class="meta">{{>edge/search-result-metadata}}</div>
</div>
<hr class="w3m-helper" />
@@ -19,7 +19,6 @@
<article>
{{>edge/parts/search-form}}

{{#each domainResults}}{{>edge/browse-result-rb}}{{/each}}

<section class="cards">
{{#if maintenanceMessage}}<section class="card problems onlyscreen"><h2>Maintenance</h2><p class="description">{{maintenanceMessage}}</p></section>{{/if}}
@@ -41,6 +40,7 @@

{{#unless evalResult}}{{#if problems}}<section class="card problems onlyscreen"><h2>Suggestions</h2><ul class="onlyscreen search-problems">{{#each problems}}<li>{{{.}}}</li>{{/each}}</ul></section> {{/if}}{{/unless}}

{{#each domainResults}}{{>edge/browse-result}}{{/each}}
{{#each results}}{{>edge/search-result}}{{/each}}

</section>
@@ -38,6 +38,31 @@ class SentenceExtractorTest {
        legacySe.setLegacyMode(true);
    }

    public static void main(String... args) throws IOException {
        final LanguageModels lm = TestLanguageModels.getLanguageModels();

        var data = Path.of("/home/vlofgren/Code/tmp-data/");

        System.out.println("Running");

        SentenceExtractor se = new SentenceExtractor(lm);

        var dict = new TermFrequencyDict(lm);
        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
        for (;;) {
            long total = 0;
            for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
                var doc = Jsoup.parse(Files.readString(file.toPath()));
                long start = System.currentTimeMillis();
                var dld = se.extractSentences(doc);
                documentKeywordExtractor.extractKeywords(dld);
                total += (System.currentTimeMillis() - start);
            }
            System.out.println(total);
        }
    }

    @SneakyThrows
    @Test
    void testExtractSubject() {
@@ -1,8 +1,6 @@
package com.github.datquocnguyen;

import java.util.HashMap;
import java.util.function.Predicate;
import java.util.regex.Pattern;

/** GPLv3
* @author DatQuocNguyen
@@ -10,18 +8,106 @@ import java.util.regex.Pattern;
*/
public class InitialTagger
{
    private static final Pattern QUOTATION = Pattern.compile("(“)|(”)|(\")");
    private static final Predicate<String> CD = Pattern.compile("[0-9]+").asPredicate();
    private static final Predicate<String> URL = Pattern.compile("[A-Za-z]\\w*(\\.[A-Za-z]\\w+)+").asPredicate();
    private static final Predicate<String> JJ1 = Pattern.compile("([0-9]+-)|(-[0-9]+)").asPredicate();
    private static final Predicate<String> JJ2 = Pattern.compile("(^[Ii]nter.*)|(^[nN]on.*)|(^[Dd]is.*)|(^[Aa]nti.*)").asPredicate();
    private static final Predicate<String> JJ3 = Pattern.compile("(.*ful$)|(.*ous$)|(.*ble$)|(.*ic$)|(.*ive$)|(.*est$)|(.*able$)|(.*al$)").asPredicate();
    private static final Predicate<String> NN = Pattern.compile("(.*ness$)|(.*ment$)|(.*ship$)|(^[Ee]x-.*)|(^[Ss]elf-.*)").asPredicate();
    private static final Predicate<String> NNS = Pattern.compile(".*s$").asPredicate();
    private static final Predicate<String> VBG = Pattern.compile(".*ing$").asPredicate();
    private static final Predicate<String> VBN = Pattern.compile(".*ed$").asPredicate();
    private static final Predicate<String> RB = Pattern.compile(".*ly$").asPredicate();
    static public boolean jj1(String s) {
        int idx = s.indexOf('-');
        while (idx >= 0) {
            if (idx > 0 && isDigit(s.charAt(idx-1)))
                return true;
            if (idx+1 < s.length() && isDigit(s.charAt(idx+1)))
                return true;
            idx = s.indexOf('-', idx+1);
        }
        return false;
    }

    static public boolean nn(String s) {
        if (s.endsWith("ness"))
            return true;
        if (s.endsWith("ment"))
            return true;
        if (s.endsWith("ship"))
            return true;
        if (s.startsWith("Ex"))
            return true;
        if (s.startsWith("ex"))
            return true;
        if (s.startsWith("Self-"))
            return true;
        if (s.startsWith("self-"))
            return true;

        return false;
    }
    static public boolean jj2(String s) {
        if (s.startsWith("Inter"))
            return true;
        if (s.startsWith("inter"))
            return true;
        if (s.startsWith("Dis"))
            return true;
        if (s.startsWith("dis"))
            return true;
        if (s.startsWith("Anti"))
            return true;
        if (s.startsWith("anti"))
            return true;

        return false;
    }
    static public boolean jj3(String s) {
        if (s.contains("-"))
            return true;
        if (s.endsWith("ful"))
            return true;
        if (s.endsWith("ous"))
            return true;
        if (s.endsWith("ble"))
            return true;
        if (s.endsWith("ic"))
            return true;
        if (s.endsWith("ive"))
            return true;
        if (s.endsWith("est"))
            return true;
        if (s.endsWith("able"))
            return true;
        if (s.endsWith("al"))
            return true;

        return false;
    }
    static public boolean url(String s) {
        int pointIdx = s.indexOf('.');
        return pointIdx >= 0 && pointIdx != s.length()-1;
    }
    static public boolean cd(String s) {
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (isDigit(c)) {
                return true;
            }
        }
        return false;
    }

    public static boolean isDigit(char c) {
        return c >= '0' && c <= '9';
    }

    static public boolean rb(String s) {
        return s.endsWith("ly");
    }
    static public boolean vbn(String s) {
        return s.endsWith("ed");
    }
    static public boolean vbg(String s) {
        return s.endsWith("ing");
    }

    static public boolean nns(String s) {
        return Character.isLowerCase(s.charAt(0)) && s.endsWith("s");
    }

    public static String[] EnInitTagger4Sentence(
        HashMap<String, String> DICT, String[] sentence)
@@ -35,9 +121,9 @@ public class InitialTagger
    }

    private static String getTagForWordEn(HashMap<String, String> DICT, String word) {
        if (QUOTATION.matcher(word).find()) {
        if (word.contains("\"") || word.contains("“") || word.contains("”"))
            return DICT.get("''");
        }

        if ("[]()<>!".contains(word)) {
            return "?";
        }
@@ -47,28 +133,27 @@ public class InitialTagger
        String lowerW = word.toLowerCase();
        if (DICT.containsKey(lowerW))
            return DICT.get(lowerW);
        if (JJ1.test(word))
        if (jj1(word))
            return "JJ";
        if (URL.test(word))
        if (url(word))
            return "NN";
        if (CD.test(word))
        if (cd(word))
            return "CD";
        if (NN.test(word))
        if (nn(word))
            return "NN";
        if (NNS.test(word)
                && Character.isLowerCase(word.charAt(0)))
        if (nns(word))
            return "NNS";
        if (Character.isUpperCase(word.charAt(0)))
            return "NNP";
        if (JJ2.test(word))
        if (jj2(word))
            return "JJ";
        if (VBG.test(word))
        if (vbg(word))
            return "VBG";
        if (VBN.test(word))
        if (vbn(word))
            return "VBN";
        if (word.contains("-") || JJ3.test(word))
        if (jj3(word))
            return "JJ";
        if (RB.test(word))
        if (rb(word))
            return "RB";

        return "NN";
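Every anchored regex in this tagger (.*ly$, [0-9]+, the URL-ish dot pattern) became a direct endsWith/startsWith/charAt test, and getTagForWordEn runs for every token of every sentence, so skipping the per-call Matcher allocation adds up. When hand-rolling such replacements it is easy to drift from the pattern being replaced, so a cheap agreement check is worth keeping around; a sketch with illustrative sample words:

```java
import java.util.List;
import java.util.function.Predicate;
import java.util.regex.Pattern;

class EquivalenceCheck {
    // The regex being replaced and its hand-rolled stand-in.
    static final Predicate<String> RB_REGEX = Pattern.compile(".*ly$").asPredicate();
    static boolean rb(String s) { return s.endsWith("ly"); }

    public static void main(String[] args) {
        for (String w : List.of("quickly", "fly", "table", "ly", "")) {
            if (RB_REGEX.test(w) != rb(w)) {
                throw new AssertionError("Mismatch on: " + w);
            }
        }
        System.out.println("rb() agrees with .*ly$ on all samples");
    }
}
```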