Merge pull request 'UX improvements for "show more results".' (#116) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/116
This commit is contained in:
Viktor Lofgren 2022-09-15 15:57:15 +02:00
commit ac7c1772de
21 changed files with 362 additions and 102 deletions

View File

@ -28,11 +28,11 @@ import java.nio.file.Path;
import java.time.Duration;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
@Tag("e2e")
@Testcontainers
@ -188,7 +188,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());
var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
assertEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query"));
}
@ -201,7 +201,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());
var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
assertNotEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
assertEquals(Collections.emptyList(), getTitlesFromSearchResults(html));
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-yes-js"));
}
@ -214,7 +214,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());
var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
assertEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-no-js"));
}

View File

@ -31,7 +31,7 @@ public class WordPatterns {
public static final Set<String> topWords;
static {
topWords = new HashSet<>(200);
topWords = new HashSet<>(200, 0.25f);
try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-stopwords"),
"Could not load word frequency table");
var br = new BufferedReader(new InputStreamReader(resource))
@ -87,11 +87,33 @@ public class WordPatterns {
return true;
}
/**
 * Allocation-free check that a token looks like a plausible word.
 *
 * A single leading and/or trailing '#' is tolerated (hash-tags); every
 * remaining character must be one of "_@.'+-", an ASCII letter or digit,
 * or a Latin-1 letter (U+00C0–U+00D6, U+00D8–U+00F6, U+00F8–U+00FF).
 *
 * @param s token to inspect; the empty string is rejected
 * @return true when every inspected character is word-like
 */
public static boolean hasWordQualities(String s) {
    // Guard: previously s.charAt(0) threw StringIndexOutOfBoundsException on "".
    // The regex predicate this method replaced also rejected the empty string.
    if (s.isEmpty()) {
        return false;
    }

    int start = 0;
    int end = s.length();
    if (s.charAt(0) == '#') start++;
    if (end > 1 && s.charAt(end-1) == '#') end--;

    for (int i = start; i < end; i++) {
        char c = s.charAt(i);
        if (!("_@.'+-".indexOf(c) >= 0)
            && !(c >= 'a' && c <= 'z')
            && !(c >= 'A' && c <= 'Z')
            && !(c >= '0' && c <= '9')
            && !(c >= '\u00C0' && c <= '\u00D6')
            && !(c >= '\u00D8' && c <= '\u00f6')
            && !(c >= '\u00f8' && c <= '\u00ff')) {
            return false;
        }
    }
    return true;
}
public static boolean isStopWord(String s) {
if (s.length() < MIN_WORD_LENGTH) {
return true;
}
if (!wordQualitiesPredicate.test(s)) {
if (!hasWordQualities(s)) {
return true;
}
if (!filter(s)) {

View File

@ -1,16 +1,23 @@
package nu.marginalia.util.language.processing;
import java.util.function.Predicate;
import java.util.regex.Pattern;
public class AsciiFlattener {
private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:\\-]+");
private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$");
private static final Predicate<String> plainAscii = plainAsciiPattern.asMatchPredicate();
/**
 * True when the string contains only code units below U+0080, i.e. it is
 * pure ASCII and needs no unicode flattening.
 *
 * Fixed: the previous test {@code (c & 0x80) != 0} inspected only bit 7, so
 * characters such as U+0100..U+017F (Latin Extended-A, bit 7 clear) were
 * wrongly treated as plain ASCII and escaped flattening.
 */
private static boolean isPlainAscii(String s) {
    for (int i = 0; i < s.length(); i++) {
        if (s.charAt(i) > 127) {
            return false;
        }
    }
    return true;
}
public static String flattenUnicode(String s) {
if (plainAscii.test(s)) {
if (isPlainAscii(s)) {
return s;
}

View File

@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import javax.inject.Inject;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class DocumentKeywordExtractor {
@ -156,13 +157,16 @@ public class DocumentKeywordExtractor {
}
}
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
Set<String> reps = new HashSet<>();
for (var sent : documentLanguageData.sentences) {
for (var word : sent) {
String lc = word.wordLowerCase();
if (lc.matches("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+")) {
if (lc.length() > 6
&& lc.indexOf('@') > 0
&& mailLikePattern.matcher(lc).matches()) {
reps.add(lc);
String domain = lc.substring(lc.indexOf('@'));
@ -189,6 +193,6 @@ public class DocumentKeywordExtractor {
}
public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns::hasWordQualities).collect(Collectors.toSet()));
}
}

View File

@ -23,8 +23,8 @@ public class KeywordCounter {
}
public WordHistogram countHisto(DocumentLanguageData dld) {
HashMap<String, Integer> counts = new HashMap<>(1000);
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
HashMap<String, Integer> counts = new HashMap<>(15000);
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000);
for (var sent : dld.sentences) {
@ -37,15 +37,15 @@ public class KeywordCounter {
String stemmed = sent.constructStemmedWordFromSpan(span);
counts.merge(stemmed, 1, Integer::sum);
instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
instances.computeIfAbsent(stemmed, k -> new HashSet<>(500)).add(new WordRep(sent, span));
}
}
double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1);
Set<WordRep> h5 = new HashSet<>();
Set<WordRep> h10 = new HashSet<>();
Set<WordRep> h15 = new HashSet<>();
Set<WordRep> h5 = new HashSet<>(2500);
Set<WordRep> h10 = new HashSet<>(500);
Set<WordRep> h15 = new HashSet<>(500);
int doubleWordCount = 0;
@ -65,13 +65,14 @@ public class KeywordCounter {
histogram.addAll(instances.get(wordStemmed));
}
return new WordHistogram(h5, h10, h15);
}
private static final Pattern separator = Pattern.compile("_");
public double getTermValue(Map.Entry<String, Integer> e, double maxValue) {
String key = e.getKey();
if (key.contains("_")) {
String[] parts = separator.split(e.getKey());
double totalValue = 0.;
for (String part : parts) {
@ -79,6 +80,10 @@ public class KeywordCounter {
}
return totalValue / parts.length;
}
else {
return value(key, e.getValue(), maxValue);
}
}
double value(String key, double value, double maxValue) {
double freq = dict.getTermFreqStemmed(key);

View File

@ -6,10 +6,10 @@ import gnu.trove.map.hash.TObjectIntHashMap;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.SneakyThrows;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import nu.marginalia.util.language.conf.LanguageModels;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;
@ -127,8 +127,9 @@ public class SentenceExtractor {
private static final Pattern dotPattern = Pattern.compile("\\.+$");
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
private static final Pattern spacesPattern = Pattern.compile("\\s+");
private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
private static final Pattern possessivePattern = Pattern.compile("'(s)?$");
public DocumentSentence extractSentence(String text) {
var wordsAndSeps = splitSegment(text);
@ -142,10 +143,20 @@ public class SentenceExtractor {
);
}
/**
 * Collapses tab, newline and carriage-return characters to plain spaces so
 * the sentence detector sees single-line input.
 *
 * Cheaper replacement for {@code replaceAll("\\s", " ")}; each
 * String.replace(char, char) returns the same instance when the character
 * is absent, so the indexOf guards merely skip a method call.
 *
 * Fixed: '\r' was not handled, so text with Windows (CRLF) line endings
 * kept stray carriage returns after normalization.
 */
public String normalizeSpaces(String s) {
    if (s.indexOf('\t') >= 0) {
        s = s.replace('\t', ' ');
    }
    if (s.indexOf('\n') >= 0) {
        s = s.replace('\n', ' ');
    }
    if (s.indexOf('\r') >= 0) {
        s = s.replace('\r', ' ');
    }
    return s;
}
public DocumentSentence[] extractSentencesFromString(String text) {
String[] sentences;
String textNormalizedSpaces = text.replaceAll("\\s", " ");
String textNormalizedSpaces = normalizeSpaces(text);
try {
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
}
@ -157,10 +168,17 @@ public class SentenceExtractor {
sentences = Arrays.copyOf(sentences, 250);
}
sentences = Arrays.stream(sentences)
.filter(s -> !s.isBlank())
.flatMap(s -> Arrays.stream(splitPattern.split(s)))
.toArray(String[]::new);
List<String> sentenceList = new ArrayList<>();
for (var s : sentences) {
if (s.isBlank()) continue;
if (s.contains("-") || s.contains("|")) {
sentenceList.addAll(Arrays.asList(splitPattern.split(s)));
}
else {
sentenceList.add(s);
}
}
sentences = sentenceList.toArray(String[]::new);
final String[][] tokens = new String[sentences.length][];
final int[][] separators = new int[sentences.length][];
@ -178,7 +196,9 @@ public class SentenceExtractor {
separators[i] = Arrays.copyOf(separators[i], 250);
}
for (int j = 0; j < tokens[i].length; j++) {
tokens[i][j] = dotPattern.matcher(tokens[i][j]).replaceAll( "");
if (tokens[i][j].endsWith(".")) {
tokens[i][j] = dotPattern.matcher(tokens[i][j]).replaceAll("");
}
}
}
@ -204,7 +224,7 @@ public class SentenceExtractor {
private String[] stemSentence(String[] strings) {
String[] stemmed = new String[strings.length];
for (int i = 0; i < stemmed.length; i++) {
var sent = possessivePattern.matcher(strings[i]).replaceAll("");
var sent = cleanPossessive(strings[i]);
try {
stemmed[i] = porterStemmer.stem(sent);
}
@ -215,10 +235,23 @@ public class SentenceExtractor {
return stemmed;
}
/**
 * Strips an English possessive ending: a bare trailing apostrophe, or a
 * trailing 's / 'S.  Replaces the former regex "'(s)?$".
 *
 * NOTE(review): only the 's/'S branch lower-cases the result, and the
 * apostrophe-only branch does not — confirm this asymmetry is intended
 * (in stemSentence the old regex path never lower-cased).
 *
 * NOTE(review): end > 2 means the two-character string "'s" is returned
 * unchanged, whereas the old regex reduced it to "" — verify.
 */
private String cleanPossessive(String s) {
int end = s.length();

// e.g. "dogs'" -> "dogs"
if (s.endsWith("\'")) {
return s.substring(0, end-1);
} else if (end > 2 && s.charAt(end-2) == '\'' && "sS".indexOf(s.charAt(end-1))>=0) {
// e.g. "dog's" / "dog'S" -> "dog" (lower-cased)
return s.substring(0, end-2).toLowerCase();
}
else {
return s;
}
}
/**
 * Produces a lower-cased copy of the word array with possessive endings
 * ("'s", "'S", trailing "'") stripped from each entry.
 */
private String[] toLc(String[] words) {
    final String[] result = new String[words.length];
    int idx = 0;
    for (String word : words) {
        result[idx++] = cleanPossessive(word).toLowerCase();
    }
    return result;
}

View File

@ -8,7 +8,6 @@ import java.lang.ref.SoftReference;
import java.util.BitSet;
import java.util.Iterator;
import java.util.StringJoiner;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@ -53,27 +52,71 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
return words.length;
}
private final static Pattern trailingJunkPattern = Pattern.compile("(^[\"'_*]+|[_*'\"]+$)");
private final static Pattern joinerPattern = Pattern.compile("[-+.]+");
/**
 * Trims any leading and trailing run of quote/underscore/asterisk characters
 * ("\"'_*") from the word, returning the original instance when nothing
 * needed trimming.  Replaces the former trailingJunkPattern regex.
 */
private String removeJunk(String s) {
    final String junk = "\"'_*";

    int from = 0;
    int to = s.length();

    while (from < to && junk.indexOf(s.charAt(from)) >= 0) {
        from++;
    }
    while (to > from && junk.indexOf(s.charAt(to - 1)) >= 0) {
        to--;
    }

    // Avoid allocating a substring when the bounds are untouched
    return (from == 0 && to == s.length()) ? s : s.substring(from, to);
}
public String constructWordFromSpan(WordSpan span) {
if (span.size() == 1) {
return removeJunk(wordsLowerCase[span.start]);
}
else {
StringJoiner sj = new StringJoiner("_");
for (int i = span.start; i < span.end; i++) {
sj.add(wordsLowerCase[i]);
}
return trailingJunkPattern.matcher(sj.toString()).replaceAll("");
return removeJunk(sj.toString());
}
}
/**
 * Canonicalizes the word-joining characters '+', '.' and '-' to '_' so that
 * multi-part stems all use the same separator.
 *
 * String.replace(char, char) already returns the same instance when the
 * character does not occur, so the previous indexOf pre-checks were
 * redundant and have been removed.
 */
private String normalizeJoiner(String s) {
    return s.replace('+', '_')
            .replace('.', '_')
            .replace('-', '_');
}
public String constructStemmedWordFromSpan(WordSpan span) {
if (span.size() > 1) {
StringJoiner sj = new StringJoiner("_");
for (int i = span.start; i < span.end; i++) {
if (includeInStemming(i))
sj.add(joinerPattern.matcher(stemmedWords[i]).replaceAll("_"));
sj.add(normalizeJoiner(stemmedWords[i]));
}
return sj.toString();
}
else if (includeInStemming(span.start)) {
return normalizeJoiner(stemmedWords[span.start]);
}
else return "";
}
private boolean includeInStemming(int i) {
if (posTags[i].equals("IN") || posTags[i].equals("TO") || posTags[i].equals("CC") || posTags[i].equals("DT")) {

View File

@ -5,16 +5,21 @@ import lombok.EqualsAndHashCode;
import lombok.Getter;
import org.jetbrains.annotations.NotNull;
import java.util.Objects;
@AllArgsConstructor @EqualsAndHashCode @Getter
public class WordRep implements Comparable<WordRep> {
/**
 * Builds the surface word and stemmed form for a span of a sentence.
 * The hash code is computed once up front because WordReps are stored in
 * large HashSets during keyword extraction.
 *
 * NOTE(review): Objects.hash(word) allocates a varargs array per call and
 * yields 31 + word.hashCode(); Objects.hashCode(word) would be cheaper —
 * confirm the exact hash value is not relied upon elsewhere.
 */
public WordRep(DocumentSentence sent, WordSpan span) {
word = sent.constructWordFromSpan(span);
stemmed = sent.constructStemmedWordFromSpan(span);
length = span.end - span.start;
hashCode = Objects.hash(word);
}
public final int length;
public final String word;
public final String stemmed;
private final int hashCode;
@Override
public int compareTo(@NotNull WordRep o) {
@ -25,4 +30,8 @@ public class WordRep implements Comparable<WordRep> {
public String toString() {
return word;
}
// Returns the hash precomputed in the constructor (derived from 'word' only),
// avoiding re-hashing on every HashSet lookup/insert.
public int hashCode() {
return hashCode;
}
}

View File

@ -112,7 +112,8 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
rsp.getInt(11), // dataHash
EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
Integer.MAX_VALUE, // rankingId
Double.MAX_VALUE // termScore
Double.MAX_VALUE, // termScore
1 // resultsFromSameDomain
);
if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
&& Strings.isNullOrEmpty(val.description)

View File

@ -32,6 +32,7 @@ import spark.Spark;
import java.util.*;
import java.util.function.LongPredicate;
import java.util.stream.Collectors;
import static java.util.Comparator.comparing;
import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
@ -184,14 +185,24 @@ public class EdgeIndexQueryService {
}
cachePool.clear();
return results.stream()
List<EdgeSearchResultItem> resultList = results.stream()
.sorted(
comparing(EdgeSearchResultItem::getScore)
.thenComparing(EdgeSearchResultItem::getRanking)
.thenComparing(EdgeSearchResultItem::getUrlIdInt)
)
.filter(domainCountFilter::test)
.limit(specsSet.getLimitTotal()).toList();
.collect(Collectors.toList());
if (resultList.size() > specsSet.getLimitTotal()) {
resultList.subList(specsSet.getLimitTotal(), resultList.size()).clear();
}
for (var result : resultList) {
result.resultsFromDomain = domainCountFilter.getCount(result);
}
return resultList;
}

View File

@ -36,15 +36,18 @@ public class ResultDomainDeduplicator {
}
public boolean test(EdgeSearchResultItem item) {
final int ranking = item.getRanking();
if (ranking == Integer.MAX_VALUE) {
final long key = item.deduplicationKey();
if (key == 0)
return true;
}
// For ResultItems, consider bucketId as well as different buckets may use different
// ranking algorithms
final long key = ranking*32L + item.bucketId;
return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
}
/**
 * Number of results recorded so far that share this item's deduplication
 * key.  Items without a meaningful key (key == 0) are reported as a single
 * result.
 */
public int getCount(EdgeSearchResultItem item) {
    final long key = item.deduplicationKey();

    return key == 0 ? 1 : resultsByRankingId.get(key);
}
}

View File

@ -16,6 +16,8 @@ public class EdgeSearchResultItem {
public final List<EdgeSearchResultKeywordScore> scores;
public int resultsFromDomain;
public EdgeSearchResultItem(int bucketId, long val) {
this.bucketId = bucketId;
this.combinedId = val;
@ -32,6 +34,7 @@ public class EdgeSearchResultItem {
public int getRanking() {
return (int)(combinedId >>> 32);
}
public int getResultsFromDomain() { return resultsFromDomain; }
/* Used for evaluation */
private transient double scoreValue = 1;
@ -56,4 +59,14 @@ public class EdgeSearchResultItem {
}
return false;
}
/**
 * Key used to group results from the same ranking/bucket for domain
 * deduplication.  A return value of 0 means "no key": such items are
 * exempted from deduplication by the callers.
 *
 * NOTE(review): ranking*32L + bucketId assumes bucketId stays below 32 to
 * avoid collisions — confirm against the bucket configuration.
 * NOTE(review): ranking == 0 with bucketId == 0 also produces 0 and would
 * be treated as "no key" — verify that combination cannot occur.
 */
public long deduplicationKey() {
final int ranking = getRanking();

if (ranking == Integer.MAX_VALUE) {
return 0;
}

return ranking*32L + bucketId;
}
}

View File

@ -30,6 +30,12 @@ public class EdgeUrlDetails {
public long rankingId;
public double termScore;
public int resultsFromSameDomain;
// True when the originating domain contributed more than this one result,
// used by the template to render the "N+" more-results link.
public boolean hasMoreResults() {
return resultsFromSameDomain > 1;
}
public long rankingIdAdjustment() {
int penalty = 0;

View File

@ -67,7 +67,12 @@ public class SiteListCommand implements SearchCommandInterface {
resultSet = Collections.emptyList();
}
return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, "focusDomain", Objects.requireNonNullElse(domain, ""), "profile", parameters.profileStr(), "results", resultSet, "screenshot", screenshotPath == null ? "" : screenshotPath.toString())));
return Optional.of(siteInfoRenderer.render(results, Map.of("query", query,
"hideRanking", true,
"focusDomain", Objects.requireNonNullElse(domain, ""),
"profile", parameters.profileStr(),
"results", resultSet, "screenshot",
screenshotPath == null ? "" : screenshotPath.toString())));
}

View File

@ -59,6 +59,7 @@ public class SearchResultDecorator {
details.rankingId = rankingId;
}
details.resultsFromSameDomain = resultItem.resultsFromDomain;
details.termScore = calculateTermScore(resultItem, details);
logger.debug("{} -> {}", details.url, details.termScore);

View File

@ -1,12 +0,0 @@
<section class="card browse-result rightbox">
<h2>{{url.domain}}</h2>
<a href="{{url.proto}}://{{url.domain}}/">
<img src="/screenshot/{{domainId}}" title="{{description}}" loading="lazy"/>
</a>
<div class="utils">
<a href="/site/{{url.domain}}">Info</a>
<a href="/explore/{{url.domain}}">Similar Domains</a>
</div>
</section>

View File

@ -5,6 +5,6 @@
{{#if cookies}}<abbr title="cookies" class="meta">👁️️</abbr>{{/if}}
{{#if ads}}<abbr title="possible annoying ads or popovers (experimental)" class="meta">⚠️️️</abbr>{{/if}}
<span class="meta">{{format}}</span>
{{#unless focusDomain}}
{{#unless hideRanking}}
<span class="rank-symbol" title="{{rankingSymbolDesc}}">{{{rankingSymbol}}}</span>
{{/unless}}

View File

@ -6,8 +6,7 @@
<div class="utils">
<a href="/site/{{url.domain}}" title="Domain Information">Info</a>
{{#unless focusDomain}}<a href="/site-search/{{url.domain}}/{{query}}?profile={{profile}}" title="Narrow the search to this domain">Search</a>{{/unless}}
{{#unless focusDomain}}{{#if hasMoreResults}}<a href="/site-search/{{url.domain}}/{{query}}?profile={{profile}}" title="More results from this domain">{{resultsFromSameDomain}}+</a>{{/if}}{{/unless}}
<div class="meta">{{>edge/search-result-metadata}}</div>
</div>
<hr class="w3m-helper" />

View File

@ -19,7 +19,6 @@
<article>
{{>edge/parts/search-form}}
{{#each domainResults}}{{>edge/browse-result-rb}}{{/each}}
<section class="cards">
{{#if maintenanceMessage}}<section class="card problems onlyscreen"><h2>Maintenance</h2><p class="description">{{maintenanceMessage}}</p></section>{{/if}}
@ -41,6 +40,7 @@
{{#unless evalResult}}{{#if problems}}<section class="card problems onlyscreen"><h2>Suggestions</h2><ul class="onlyscreen search-problems">{{#each problems}}<li>{{{.}}}</li>{{/each}}</ul></section> {{/if}}{{/unless}}
{{#each domainResults}}{{>edge/browse-result}}{{/each}}
{{#each results}}{{>edge/search-result}}{{/each}}
</section>

View File

@ -38,6 +38,31 @@ class SentenceExtractorTest {
legacySe.setLegacyMode(true);
}
/**
 * Manual benchmark entry point (not a unit test): repeatedly runs sentence
 * extraction + keyword extraction over a local corpus and prints the wall
 * time of each pass in milliseconds.
 *
 * The outer for(;;) loop is intentionally infinite — kill the process to
 * stop.  Note the hard-coded developer-machine path below.
 */
public static void main(String... args) throws IOException {
final LanguageModels lm = TestLanguageModels.getLanguageModels();

// Hard-coded corpus location; adjust to a local directory of HTML files.
var data = Path.of("/home/vlofgren/Code/tmp-data/");

System.out.println("Running");

SentenceExtractor se = new SentenceExtractor(lm);

var dict = new TermFrequencyDict(lm);
DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);

for (;;) {
long total = 0;
for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
var doc = Jsoup.parse(Files.readString(file.toPath()));
long start = System.currentTimeMillis();
var dld = se.extractSentences(doc);
documentKeywordExtractor.extractKeywords(dld);
// Only the extraction time is measured; file I/O and parsing are excluded.
total += (System.currentTimeMillis() - start);
}
System.out.println(total);
}
}
@SneakyThrows
@Test
void testExtractSubject() {

View File

@ -1,8 +1,6 @@
package com.github.datquocnguyen;
import java.util.HashMap;
import java.util.function.Predicate;
import java.util.regex.Pattern;
/** GPLv3
* @author DatQuocNguyen
@ -10,18 +8,106 @@ import java.util.regex.Pattern;
*/
public class InitialTagger
{
private static final Pattern QUOTATION = Pattern.compile("(“)|(”)|(\")");
static public boolean jj1(String s) {
int idx = s.indexOf('-');
while (idx >= 0) {
if (idx > 0 && isDigit(s.charAt(idx-1)))
return true;
if (idx+1 < s.length() && isDigit(s.charAt(idx+1)))
return true;
private static final Predicate<String> CD = Pattern.compile("[0-9]+").asPredicate();
private static final Predicate<String> URL = Pattern.compile("[A-Za-z]\\w*(\\.[A-Za-z]\\w+)+").asPredicate();
private static final Predicate<String> JJ1 = Pattern.compile("([0-9]+-)|(-[0-9]+)").asPredicate();
private static final Predicate<String> JJ2 = Pattern.compile("(^[Ii]nter.*)|(^[nN]on.*)|(^[Dd]is.*)|(^[Aa]nti.*)").asPredicate();
private static final Predicate<String> JJ3 = Pattern.compile("(.*ful$)|(.*ous$)|(.*ble$)|(.*ic$)|(.*ive$)|(.*est$)|(.*able$)|(.*al$)").asPredicate();
private static final Predicate<String> NN = Pattern.compile("(.*ness$)|(.*ment$)|(.*ship$)|(^[Ee]x-.*)|(^[Ss]elf-.*)").asPredicate();
private static final Predicate<String> NNS = Pattern.compile(".*s$").asPredicate();
private static final Predicate<String> VBG = Pattern.compile(".*ing$").asPredicate();
private static final Predicate<String> VBN = Pattern.compile(".*ed$").asPredicate();
private static final Predicate<String> RB = Pattern.compile(".*ly$").asPredicate();
idx = s.indexOf('-', idx+1);
}
return false;
}
/**
 * Noun (NN) heuristic, mirroring the replaced regex
 * (.*ness$)|(.*ment$)|(.*ship$)|(^[Ee]x-.*)|(^[Ss]elf-.*).
 *
 * Fixed: the "Ex"/"ex" prefix checks omitted the hyphen the regex required,
 * so ordinary words such as "example" or "extra" were misclassified as NN.
 */
static public boolean nn(String s) {
    if (s.endsWith("ness"))
        return true;
    if (s.endsWith("ment"))
        return true;
    if (s.endsWith("ship"))
        return true;
    if (s.startsWith("Ex-"))
        return true;
    if (s.startsWith("ex-"))
        return true;
    if (s.startsWith("Self-"))
        return true;
    if (s.startsWith("self-"))
        return true;
    return false;
}
/**
 * Adjective (JJ) prefix heuristic, mirroring the replaced regex
 * (^[Ii]nter.*)|(^[nN]on.*)|(^[Dd]is.*)|(^[Aa]nti.*).
 *
 * Fixed: the "Non"/"non" prefix was present in the original regex but was
 * dropped in the hand-rolled version, so words like "nonlinear" were no
 * longer recognized.
 */
static public boolean jj2(String s) {
    return s.startsWith("Inter") || s.startsWith("inter")
        || s.startsWith("Non")   || s.startsWith("non")
        || s.startsWith("Dis")   || s.startsWith("dis")
        || s.startsWith("Anti")  || s.startsWith("anti");
}
/**
 * Adjective (JJ) suffix heuristic: hyphenated words, or words ending in one
 * of the adjectival suffixes from the replaced regex
 * (.*ful$)|(.*ous$)|(.*ble$)|(.*ic$)|(.*ive$)|(.*est$)|(.*able$)|(.*al$);
 * the hyphen test was previously applied at the call site.
 */
static public boolean jj3(String s) {
    return s.contains("-")
        || s.endsWith("ful")
        || s.endsWith("ous")
        || s.endsWith("ble")
        || s.endsWith("ic")
        || s.endsWith("ive")
        || s.endsWith("est")
        || s.endsWith("able")
        || s.endsWith("al");
}
/**
 * Cheap approximation of "looks like a URL/hostname": the word contains a
 * dot that is not its final character.
 *
 * NOTE(review): deliberately looser than the regex it replaced,
 * [A-Za-z]\w*(\.[A-Za-z]\w+)+ — e.g. ".x" or "1.2" now match too; confirm
 * the broadening is acceptable for the tagger.
 */
static public boolean url(String s) {
int pointIdx = s.indexOf('.');
return pointIdx >= 0 && pointIdx != s.length()-1;
}
/**
 * Cardinal-number (CD) heuristic: true when the word contains at least one
 * ASCII digit (equivalent to a find() with the replaced regex [0-9]+).
 */
static public boolean cd(String s) {
    return s.chars().anyMatch(ch -> ch >= '0' && ch <= '9');
}
/** ASCII-only digit test; intentionally narrower than Character.isDigit. */
public static boolean isDigit(char c) {
    return '0' <= c && c <= '9';
}
/** Adverb (RB) heuristic: words ending in "-ly" (replaced regex .*ly$). */
static public boolean rb(String s) {
return s.endsWith("ly");
}
/**
 * Past-participle (VBN) heuristic: words ending in "-ed".
 *
 * Fixed: previously tested endsWith("vbn") — the tag name rather than the
 * suffix matched by the replaced regex .*ed$ — so it effectively never
 * fired on real words.
 */
static public boolean vbn(String s) {
    return s.endsWith("ed");
}
/**
 * Gerund (VBG) heuristic: words ending in "-ing".
 *
 * Fixed: previously tested endsWith("vbg") — the tag name rather than the
 * suffix matched by the replaced regex .*ing$ — so it effectively never
 * fired on real words.
 */
static public boolean vbg(String s) {
    return s.endsWith("ing");
}
/**
 * Plural-noun (NNS) heuristic: lowercase-initial words ending in "s"
 * (replaced regex .*s$ combined with the call site's lowercase check).
 *
 * Guards against the empty string, on which charAt(0) would throw.
 */
static public boolean nns(String s) {
    return !s.isEmpty() && Character.isLowerCase(s.charAt(0)) && s.endsWith("s");
}
public static String[] EnInitTagger4Sentence(
HashMap<String, String> DICT, String[] sentence)
@ -35,9 +121,9 @@ public class InitialTagger
}
private static String getTagForWordEn(HashMap<String, String> DICT, String word) {
if (QUOTATION.matcher(word).find()) {
if (word.contains("\"") || word.contains("") || word.contains(""))
return DICT.get("''");
}
if ("[]()<>!".contains(word)) {
return "?";
}
@ -47,28 +133,27 @@ public class InitialTagger
String lowerW = word.toLowerCase();
if (DICT.containsKey(lowerW))
return DICT.get(lowerW);
if (JJ1.test(word))
if (jj1(word))
return "JJ";
if (URL.test(word))
if (url(word))
return "NN";
if (CD.test(word))
if (cd(word))
return "CD";
if (NN.test(word))
if (nn(word))
return "NN";
if (NNS.test(word)
&& Character.isLowerCase(word.charAt(0)))
if (nns(word))
return "NNS";
if (Character.isUpperCase(word.charAt(0)))
return "NNP";
if (JJ2.test(word))
if (jj2(word))
return "JJ";
if (VBG.test(word))
if (vbg(word))
return "VBG";
if (VBN.test(word))
if (vbn(word))
return "VBN";
if (word.contains("-") || JJ3.test(word))
if (jj3(word))
return "JJ";
if (RB.test(word))
if (rb(word))
return "RB";
return "NN";