diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java index 11b51eef..cc238354 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java @@ -28,11 +28,11 @@ import java.nio.file.Path; import java.time.Duration; import java.time.LocalDateTime; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotEquals; @Tag("e2e") @Testcontainers @@ -188,7 +188,7 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); - assertEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html)); + assertEquals(List.of("Bird"), getTitlesFromSearchResults(html)); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query")); } @@ -201,7 +201,7 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); - assertNotEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html)); + assertEquals(Collections.emptyList(), getTitlesFromSearchResults(html)); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-yes-js")); } @@ -214,7 +214,7 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); - assertEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html)); + assertEquals(List.of("Bird"), getTitlesFromSearchResults(html)); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-no-js")); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java index e09c7709..d87f304f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java @@ -31,7 +31,7 @@ public class WordPatterns { public static final Set topWords; static { - topWords = new HashSet<>(200); + topWords = new HashSet<>(200, 0.25f); try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-stopwords"), "Could not load word frequency table"); var br = new BufferedReader(new InputStreamReader(resource)) @@ -87,11 +87,33 @@ public class WordPatterns { return true; } + public static boolean hasWordQualities(String s) { + int start = 0; + int end = s.length(); + if (s.charAt(0) == '#') start++; + if (end > 1 && s.charAt(end-1) == '#') end--; + + for (int i = start; i < end; i++) { + char c = s.charAt(i); + if (!("_@.'+-".indexOf(c) >= 0) + && !(c >= 'a' && c <= 'z') + && !(c >= 'A' && c <= 'Z') + && !(c >= '0' && c <= '9') + && !(c >= '\u00C0' && c <= '\u00D6') + && !(c >= '\u00D8' && c <= '\u00f6') + && !(c >= '\u00f8' && c <= '\u00ff')) { + return false; + } + } + + return true; + } + public static boolean isStopWord(String s) { if (s.length() < MIN_WORD_LENGTH) { return true; } - if (!wordQualitiesPredicate.test(s)) { + if (!hasWordQualities(s)) { return true; } if (!filter(s)) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/AsciiFlattener.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/AsciiFlattener.java index ca53c7d8..6abcbdb5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/AsciiFlattener.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/AsciiFlattener.java @@ -1,16 +1,23 @@ package nu.marginalia.util.language.processing; -import java.util.function.Predicate; import java.util.regex.Pattern; public class AsciiFlattener { private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:\\-]+"); - private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$"); - private static final Predicate plainAscii = plainAsciiPattern.asMatchPredicate(); + private static boolean isPlainAscii(String s) { + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if ((c & 0x80) != 0) { + return false; + } + } + return true; + } public static String flattenUnicode(String s) { - if (plainAscii.test(s)) { + + if (isPlainAscii(s)) { return s; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java index 2626e2e8..58b7c198 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java @@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import javax.inject.Inject; import java.util.*; +import java.util.regex.Pattern; import java.util.stream.Collectors; public class DocumentKeywordExtractor { @@ -156,13 +157,16 @@ public class DocumentKeywordExtractor { } } + private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+"); private Collection getArtifacts(DocumentLanguageData documentLanguageData) { Set reps = new HashSet<>(); for (var sent : documentLanguageData.sentences) { for (var word : sent) { String lc = word.wordLowerCase(); - if (lc.matches("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+")) { + if (lc.length() > 6 + && lc.indexOf('@') > 0 + && mailLikePattern.matcher(lc).matches()) { reps.add(lc); String domain = lc.substring(lc.indexOf('@')); @@ -189,6 +193,6 @@ public class DocumentKeywordExtractor { } public EdgePageWords createWords(IndexBlock block, Collection words) { - return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet())); + return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns::hasWordQualities).collect(Collectors.toSet())); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java index 5a8af220..efa57bd2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java @@ -23,8 +23,8 @@ public class KeywordCounter { } public WordHistogram countHisto(DocumentLanguageData dld) { - HashMap counts = new HashMap<>(1000); - HashMap> instances = new HashMap<>(1000); + HashMap counts = new HashMap<>(15000); + HashMap> instances = new HashMap<>(15000); for (var sent : dld.sentences) { @@ -37,15 +37,15 @@ public class KeywordCounter { String stemmed = sent.constructStemmedWordFromSpan(span); counts.merge(stemmed, 1, Integer::sum); - instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span)); + instances.computeIfAbsent(stemmed, k -> new HashSet<>(500)).add(new WordRep(sent, span)); } } double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1); - Set h5 = new HashSet<>(); - Set h10 = new HashSet<>(); - Set h15 = new HashSet<>(); + Set h5 = new HashSet<>(2500); + Set h10 = new HashSet<>(500); + Set h15 = new HashSet<>(500); int doubleWordCount = 0; @@ -65,19 +65,24 @@ public class KeywordCounter { histogram.addAll(instances.get(wordStemmed)); } - return new WordHistogram(h5, h10, h15); } private static final Pattern separator = Pattern.compile("_"); public double getTermValue(Map.Entry e, double maxValue) { - String[] parts = separator.split(e.getKey()); - double totalValue = 0.; - for (String part : parts) { - totalValue += value(part, e.getValue(), maxValue); + String key = e.getKey(); + if (key.contains("_")) { + String[] parts = separator.split(e.getKey()); + double totalValue = 0.; + for (String part : parts) { + totalValue += value(part, e.getValue(), maxValue); + } + return totalValue / parts.length; + } + else { + return value(key, e.getValue(), maxValue); } - return totalValue / parts.length; } double value(String key, double value, double maxValue) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java index 6a5abe8a..87283f71 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java @@ -6,10 +6,10 @@ import gnu.trove.map.hash.TObjectIntHashMap; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.SneakyThrows; +import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.util.language.processing.model.DocumentSentence; import nu.marginalia.util.language.processing.model.tag.WordSeparator; -import nu.marginalia.util.language.conf.LanguageModels; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.stemmer.PorterStemmer; @@ -127,8 +127,9 @@ public class SentenceExtractor { private static final Pattern dotPattern = Pattern.compile("\\.+$"); private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)"); + private static final Pattern spacesPattern = Pattern.compile("\\s+"); + private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))"); - private static final Pattern possessivePattern = Pattern.compile("'(s)?$"); public DocumentSentence extractSentence(String text) { var wordsAndSeps = splitSegment(text); @@ -142,10 +143,20 @@ public class SentenceExtractor { ); } + public String normalizeSpaces(String s) { + if (s.indexOf('\t') >= 0) { + s = s.replace('\t', ' '); + } + if (s.indexOf('\n') >= 0) { + s = s.replace('\n', ' '); + } + return s; + } + public DocumentSentence[] extractSentencesFromString(String text) { String[] sentences; - String textNormalizedSpaces = text.replaceAll("\\s", " "); + String textNormalizedSpaces = normalizeSpaces(text); try { sentences = sentenceDetector.sentDetect(textNormalizedSpaces); } @@ -157,10 +168,17 @@ public class SentenceExtractor { sentences = Arrays.copyOf(sentences, 250); } - sentences = Arrays.stream(sentences) - .filter(s -> !s.isBlank()) - .flatMap(s -> Arrays.stream(splitPattern.split(s))) - .toArray(String[]::new); + List sentenceList = new ArrayList<>(); + for (var s : sentences) { + if (s.isBlank()) continue; + if (s.contains("-") || s.contains("|")) { + sentenceList.addAll(Arrays.asList(splitPattern.split(s))); + } + else { + sentenceList.add(s); + } + } + sentences = sentenceList.toArray(String[]::new); final String[][] tokens = new String[sentences.length][]; final int[][] separators = new int[sentences.length][]; @@ -178,7 +196,9 @@ public class SentenceExtractor { separators[i] = Arrays.copyOf(separators[i], 250); } for (int j = 0; j < tokens[i].length; j++) { - tokens[i][j] = dotPattern.matcher(tokens[i][j]).replaceAll( ""); + if (tokens[i][j].endsWith(".")) { + tokens[i][j] = dotPattern.matcher(tokens[i][j]).replaceAll(""); + } } } @@ -204,7 +224,7 @@ public class SentenceExtractor { private String[] stemSentence(String[] strings) { String[] stemmed = new String[strings.length]; for (int i = 0; i < stemmed.length; i++) { - var sent = possessivePattern.matcher(strings[i]).replaceAll(""); + var sent = cleanPossessive(strings[i]); try { stemmed[i] = porterStemmer.stem(sent); } @@ -215,10 +235,23 @@ public class SentenceExtractor { return stemmed; } + private String cleanPossessive(String s) { + int end = s.length(); + + if (s.endsWith("\'")) { + return s.substring(0, end-1); + } else if (end > 2 && s.charAt(end-2) == '\'' && "sS".indexOf(s.charAt(end-1))>=0) { + return s.substring(0, end-2).toLowerCase(); + } + else { + return s; + } + } + private String[] toLc(String[] words) { String[] lower = new String[words.length]; for (int i = 0; i < lower.length; i++) { - lower[i] = possessivePattern.matcher(words[i].toLowerCase()).replaceAll(""); + lower[i] = cleanPossessive(words[i]).toLowerCase(); } return lower; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java index b56c5972..0f0ae0aa 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java @@ -8,7 +8,6 @@ import java.lang.ref.SoftReference; import java.util.BitSet; import java.util.Iterator; import java.util.StringJoiner; -import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -53,26 +52,70 @@ public class DocumentSentence implements Iterable{ return words.length; } - private final static Pattern trailingJunkPattern = Pattern.compile("(^[\"'_*]+|[_*'\"]+$)"); - private final static Pattern joinerPattern = Pattern.compile("[-+.]+"); + private String removeJunk(String s) { + int start = 0; + int end = s.length(); - public String constructWordFromSpan(WordSpan span) { - StringJoiner sj = new StringJoiner("_"); - for (int i = span.start; i < span.end; i++) { - sj.add(wordsLowerCase[i]); + for (; start < end; start++) { + if ("\"'_*".indexOf(s.charAt(start)) < 0) + break; } - return trailingJunkPattern.matcher(sj.toString()).replaceAll(""); + for (; end > start; end--) { + if ("\"'_*".indexOf(s.charAt(end-1)) < 0) + break; + } + + if (start > 0 || end < s.length()) { + return s.substring(start, end); + } + else { + return s; + } } - public String constructStemmedWordFromSpan(WordSpan span) { - StringJoiner sj = new StringJoiner("_"); - for (int i = span.start; i < span.end; i++) { - if (includeInStemming(i)) - sj.add(joinerPattern.matcher(stemmedWords[i]).replaceAll("_")); - + public String constructWordFromSpan(WordSpan span) { + if (span.size() == 1) { + return removeJunk(wordsLowerCase[span.start]); } - return sj.toString(); + else { + StringJoiner sj = new StringJoiner("_"); + for (int i = span.start; i < span.end; i++) { + sj.add(wordsLowerCase[i]); + } + return removeJunk(sj.toString()); + } + } + + + private String normalizeJoiner(String s) { + + if (s.indexOf('+') >= 0) { + s = s.replace('+', '_'); + } + if (s.indexOf('.') >= 0) { + s = s.replace('.', '_'); + } + if (s.indexOf('-') >= 0) { + s = s.replace('-', '_'); + } + return s; + } + public String constructStemmedWordFromSpan(WordSpan span) { + if (span.size() > 1) { + + StringJoiner sj = new StringJoiner("_"); + for (int i = span.start; i < span.end; i++) { + if (includeInStemming(i)) + sj.add(normalizeJoiner(stemmedWords[i])); + + } + return sj.toString(); + } + else if (includeInStemming(span.start)) { + return normalizeJoiner(stemmedWords[span.start]); + } + else return ""; } private boolean includeInStemming(int i) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java index 2edb2d94..764d2fac 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java @@ -5,16 +5,21 @@ import lombok.EqualsAndHashCode; import lombok.Getter; import org.jetbrains.annotations.NotNull; +import java.util.Objects; + @AllArgsConstructor @EqualsAndHashCode @Getter public class WordRep implements Comparable { + public WordRep(DocumentSentence sent, WordSpan span) { word = sent.constructWordFromSpan(span); stemmed = sent.constructStemmedWordFromSpan(span); length = span.end - span.start; + hashCode = Objects.hash(word); } public final int length; public final String word; public final String stemmed; + private final int hashCode; @Override public int compareTo(@NotNull WordRep o) { @@ -25,4 +30,8 @@ public class WordRep implements Comparable { public String toString() { return word; } + + public int hashCode() { + return hashCode; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java index 701db4e5..a695073c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java @@ -112,7 +112,8 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { rsp.getInt(11), // dataHash EdgePageScoreAdjustment.zero(), // urlQualityAdjustment Integer.MAX_VALUE, // rankingId - Double.MAX_VALUE // termScore + Double.MAX_VALUE, // termScore + 1 // resultsFromSameDomain ); if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF && Strings.isNullOrEmpty(val.description) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java index 95af9493..7cbecc5d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java @@ -32,6 +32,7 @@ import spark.Spark; import java.util.*; import java.util.function.LongPredicate; +import java.util.stream.Collectors; import static java.util.Comparator.comparing; import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH; @@ -184,14 +185,24 @@ public class EdgeIndexQueryService { } cachePool.clear(); - return results.stream() + List resultList = results.stream() .sorted( comparing(EdgeSearchResultItem::getScore) .thenComparing(EdgeSearchResultItem::getRanking) .thenComparing(EdgeSearchResultItem::getUrlIdInt) ) .filter(domainCountFilter::test) - .limit(specsSet.getLimitTotal()).toList(); + .collect(Collectors.toList()); + + if (resultList.size() > specsSet.getLimitTotal()) { + resultList.subList(specsSet.getLimitTotal(), resultList.size()).clear(); + } + + for (var result : resultList) { + result.resultsFromDomain = domainCountFilter.getCount(result); + } + + return resultList; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java index 9b1ca5e1..bdb62571 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java @@ -36,15 +36,18 @@ public class ResultDomainDeduplicator { } public boolean test(EdgeSearchResultItem item) { - final int ranking = item.getRanking(); - if (ranking == Integer.MAX_VALUE) { + final long key = item.deduplicationKey(); + if (key == 0) return true; - } - - // For ResultItems, consider bucketId as well as different buckets may use different - // ranking algorithms - final long key = ranking*32L + item.bucketId; return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain; } + + public int getCount(EdgeSearchResultItem item) { + final long key = item.deduplicationKey(); + if (key == 0) + return 1; + + return resultsByRankingId.get(key); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java index c81bdafc..4db221b4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java @@ -16,6 +16,8 @@ public class EdgeSearchResultItem { public final List scores; + public int resultsFromDomain; + public EdgeSearchResultItem(int bucketId, long val) { this.bucketId = bucketId; this.combinedId = val; @@ -32,6 +34,7 @@ public class EdgeSearchResultItem { public int getRanking() { return (int)(combinedId >>> 32); } + public int getResultsFromDomain() { return resultsFromDomain; } /* Used for evaluation */ private transient double scoreValue = 1; @@ -56,4 +59,14 @@ public class EdgeSearchResultItem { } return false; } + + public long deduplicationKey() { + final int ranking = getRanking(); + + if (ranking == Integer.MAX_VALUE) { + return 0; + } + + return ranking*32L + bucketId; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java index fa4b6759..281be169 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java @@ -30,6 +30,12 @@ public class EdgeUrlDetails { public long rankingId; public double termScore; + public int resultsFromSameDomain; + + public boolean hasMoreResults() { + return resultsFromSameDomain > 1; + } + public long rankingIdAdjustment() { int penalty = 0; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java index 6772c4cb..3a6e4ff4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java @@ -67,7 +67,12 @@ public class SiteListCommand implements SearchCommandInterface { resultSet = Collections.emptyList(); } - return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, "focusDomain", Objects.requireNonNullElse(domain, ""), "profile", parameters.profileStr(), "results", resultSet, "screenshot", screenshotPath == null ? "" : screenshotPath.toString()))); + return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, + "hideRanking", true, + "focusDomain", Objects.requireNonNullElse(domain, ""), + "profile", parameters.profileStr(), + "results", resultSet, "screenshot", + screenshotPath == null ? "" : screenshotPath.toString()))); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java index 80ec69b7..ea485b24 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java @@ -59,6 +59,7 @@ public class SearchResultDecorator { details.rankingId = rankingId; } + details.resultsFromSameDomain = resultItem.resultsFromDomain; details.termScore = calculateTermScore(resultItem, details); logger.debug("{} -> {}", details.url, details.termScore); diff --git a/marginalia_nu/src/main/resources/templates/edge/browse-result-rb.hdb b/marginalia_nu/src/main/resources/templates/edge/browse-result-rb.hdb deleted file mode 100644 index 0a0beb8f..00000000 --- a/marginalia_nu/src/main/resources/templates/edge/browse-result-rb.hdb +++ /dev/null @@ -1,12 +0,0 @@ -
-

{{url.domain}}

- - - - - - -
\ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/search-result-metadata.hdb b/marginalia_nu/src/main/resources/templates/edge/search-result-metadata.hdb index 5d6ac50a..0c2e9fed 100644 --- a/marginalia_nu/src/main/resources/templates/edge/search-result-metadata.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/search-result-metadata.hdb @@ -5,6 +5,6 @@ {{#if cookies}}👁️️{{/if}} {{#if ads}}⚠️️️{{/if}} {{format}} -{{#unless focusDomain}} +{{#unless hideRanking}} {{{rankingSymbol}}} {{/unless}} \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/search-result.hdb b/marginalia_nu/src/main/resources/templates/edge/search-result.hdb index 9a2b163a..4c6c9cf7 100644 --- a/marginalia_nu/src/main/resources/templates/edge/search-result.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/search-result.hdb @@ -6,8 +6,7 @@
Info - {{#unless focusDomain}}Search{{/unless}} - + {{#unless focusDomain}}{{#if hasMoreResults}}{{resultsFromSameDomain}}+{{/if}}{{/unless}}
{{>edge/search-result-metadata}}

diff --git a/marginalia_nu/src/main/resources/templates/edge/search-results.hdb b/marginalia_nu/src/main/resources/templates/edge/search-results.hdb index 552cb58f..6a4b9cfc 100644 --- a/marginalia_nu/src/main/resources/templates/edge/search-results.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/search-results.hdb @@ -19,7 +19,6 @@
{{>edge/parts/search-form}} -{{#each domainResults}}{{>edge/browse-result-rb}}{{/each}}
{{#if maintenanceMessage}}

Maintenance

{{maintenanceMessage}}

{{/if}} @@ -41,6 +40,7 @@ {{#unless evalResult}}{{#if problems}}

Suggestions

    {{#each problems}}
  • {{{.}}}
  • {{/each}}
{{/if}}{{/unless}} + {{#each domainResults}}{{>edge/browse-result}}{{/each}} {{#each results}}{{>edge/search-result}}{{/each}}
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java index 0bd22764..118229b1 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java @@ -38,6 +38,31 @@ class SentenceExtractorTest { legacySe.setLegacyMode(true); } + + public static void main(String... args) throws IOException { + final LanguageModels lm = TestLanguageModels.getLanguageModels(); + + var data = Path.of("/home/vlofgren/Code/tmp-data/"); + + System.out.println("Running"); + + SentenceExtractor se = new SentenceExtractor(lm); + + var dict = new TermFrequencyDict(lm); + DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); + for (;;) { + long total = 0; + for (var file : Objects.requireNonNull(data.toFile().listFiles())) { + var doc = Jsoup.parse(Files.readString(file.toPath())); + long start = System.currentTimeMillis(); + var dld = se.extractSentences(doc); + documentKeywordExtractor.extractKeywords(dld); + total += (System.currentTimeMillis() - start); + } + System.out.println(total); + } + } + @SneakyThrows @Test void testExtractSubject() { diff --git a/third_party/src/main/java/com/github/datquocnguyen/InitialTagger.java b/third_party/src/main/java/com/github/datquocnguyen/InitialTagger.java index 84819408..3fbcaa34 100644 --- a/third_party/src/main/java/com/github/datquocnguyen/InitialTagger.java +++ b/third_party/src/main/java/com/github/datquocnguyen/InitialTagger.java @@ -1,8 +1,6 @@ package com.github.datquocnguyen; import java.util.HashMap; -import java.util.function.Predicate; -import java.util.regex.Pattern; /** GPLv3 * @author DatQuocNguyen @@ -10,18 +8,106 @@ import java.util.regex.Pattern; */ public class InitialTagger { - private static final Pattern QUOTATION = Pattern.compile("(“)|(”)|(\")"); + static public boolean jj1(String s) { + int idx = s.indexOf('-'); + while (idx >= 0) { + if (idx > 0 && isDigit(s.charAt(idx-1))) + return true; + if (idx+1 < s.length() && isDigit(s.charAt(idx+1))) + return true; - private static final Predicate CD = Pattern.compile("[0-9]+").asPredicate(); - private static final Predicate URL = Pattern.compile("[A-Za-z]\\w*(\\.[A-Za-z]\\w+)+").asPredicate(); - private static final Predicate JJ1 = Pattern.compile("([0-9]+-)|(-[0-9]+)").asPredicate(); - private static final Predicate JJ2 = Pattern.compile("(^[Ii]nter.*)|(^[nN]on.*)|(^[Dd]is.*)|(^[Aa]nti.*)").asPredicate(); - private static final Predicate JJ3 = Pattern.compile("(.*ful$)|(.*ous$)|(.*ble$)|(.*ic$)|(.*ive$)|(.*est$)|(.*able$)|(.*al$)").asPredicate(); - private static final Predicate NN = Pattern.compile("(.*ness$)|(.*ment$)|(.*ship$)|(^[Ee]x-.*)|(^[Ss]elf-.*)").asPredicate(); - private static final Predicate NNS = Pattern.compile(".*s$").asPredicate(); - private static final Predicate VBG = Pattern.compile(".*ing$").asPredicate(); - private static final Predicate VBN = Pattern.compile(".*ed$").asPredicate(); - private static final Predicate RB = Pattern.compile(".*ly$").asPredicate(); + idx = s.indexOf('-', idx+1); + } + return false; + } + + static public boolean nn(String s) { + if (s.endsWith("ness")) + return true; + if (s.endsWith("ment")) + return true; + if (s.endsWith("ship")) + return true; + if (s.startsWith("Ex")) + return true; + if (s.startsWith("ex")) + return true; + if (s.startsWith("Self-")) + return true; + if (s.startsWith("self-")) + return true; + + return false; + } + static public boolean jj2(String s) { + if (s.startsWith("Inter")) + return true; + if (s.startsWith("inter")) + return true; + if (s.startsWith("Dis")) + return true; + if (s.startsWith("dis")) + return true; + if (s.startsWith("Anti")) + return true; + if (s.startsWith("anti")) + return true; + + return false; + } + static public boolean jj3(String s) { + if (s.contains("-")) + return true; + if (s.endsWith("ful")) + return true; + if (s.endsWith("ous")) + return true; + if (s.endsWith("ble")) + return true; + if (s.endsWith("ic")) + return true; + if (s.endsWith("ive")) + return true; + if (s.endsWith("est")) + return true; + if (s.endsWith("able")) + return true; + if (s.endsWith("al")) + return true; + + return false; + } + static public boolean url(String s) { + int pointIdx = s.indexOf('.'); + return pointIdx >= 0 && pointIdx != s.length()-1; + } + static public boolean cd(String s) { + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (isDigit(c)) { + return true; + } + } + return false; + } + + public static boolean isDigit(char c) { + return c >= '0' && c <= '9'; + } + + static public boolean rb(String s) { + return s.endsWith("ly"); + } + static public boolean vbn(String s) { + return s.endsWith("vbn"); + } + static public boolean vbg(String s) { + return s.endsWith("vbg"); + } + + static public boolean nns(String s) { + return Character.isLowerCase(s.charAt(0)) && s.endsWith("s"); + } public static String[] EnInitTagger4Sentence( HashMap DICT, String[] sentence) @@ -35,9 +121,9 @@ public class InitialTagger } private static String getTagForWordEn(HashMap DICT, String word) { - if (QUOTATION.matcher(word).find()) { + if (word.contains("\"") || word.contains("“") || word.contains("”")) return DICT.get("''"); - } + if ("[]()<>!".contains(word)) { return "?"; } @@ -47,28 +133,27 @@ public class InitialTagger String lowerW = word.toLowerCase(); if (DICT.containsKey(lowerW)) return DICT.get(lowerW); - if (JJ1.test(word)) + if (jj1(word)) return "JJ"; - if (URL.test(word)) + if (url(word)) return "NN"; - if (CD.test(word)) + if (cd(word)) return "CD"; - if (NN.test(word)) + if (nn(word)) return "NN"; - if (NNS.test(word) - && Character.isLowerCase(word.charAt(0))) + if (nns(word)) return "NNS"; if (Character.isUpperCase(word.charAt(0))) return "NNP"; - if (JJ2.test(word)) + if (jj2(word)) return "JJ"; - if (VBG.test(word)) + if (vbg(word)) return "VBG"; - if (VBN.test(word)) + if (vbn(word)) return "VBN"; - if (word.contains("-") || JJ3.test(word)) + if (jj3(word)) return "JJ"; - if (RB.test(word)) + if (rb(word)) return "RB"; return "NN";