(keyword-extraction) Fix bug leading to position data missing on some keywords.

This was due to a discrepancy between KeywordPositionBitmask's and WordsTfIdfCounts' concepts of a keyword.
This commit is contained in:
Viktor Lofgren 2023-09-02 14:48:12 +02:00
parent 9e185e80ce
commit c68d17d482
7 changed files with 43 additions and 34 deletions

View File

@ -89,7 +89,11 @@ public class DocumentKeywordExtractor {
var word = rep.word;
if (!word.isBlank()) {
wordsBuilder.add(word, metadata.getMetadataForWord(rep.stemmed));
long meta = metadata.getMetadataForWord(rep.stemmed);
assert meta != 0L : "Missing meta for " + rep.word;
wordsBuilder.add(word, meta);
}
}
}
@ -112,14 +116,20 @@ public class DocumentKeywordExtractor {
String w = word.wordLowerCase();
if (matchesWordPattern(w)) {
wordsBuilder.add(w, metadata.getMetadataForWord(word.stemmed()));
long meta = metadata.getMetadataForWord(word.stemmed());
assert meta != 0L : "Missing meta for " + word.word();
wordsBuilder.add(w, meta);
}
}
for (var names : keywordExtractor.getProperNames(sent)) {
var rep = new WordRep(sent, names);
wordsBuilder.add(rep.word, metadata.getMetadataForWord(rep.stemmed));
long meta = metadata.getMetadataForWord(rep.stemmed);
assert meta != 0L : "Missing meta for " + rep.word;
wordsBuilder.add(rep.word, meta);
}
}
}

View File

@ -23,6 +23,10 @@ public class KeywordPositionBitmask {
positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getProperNames(sent)) {
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
@ -38,6 +42,10 @@ public class KeywordPositionBitmask {
positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getProperNames(sent)) {
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}

View File

@ -51,7 +51,7 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
for (var sent : dld.sentences) {
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
for (var span : keywords) {
if (highTfIdfInstances.contains(spanToStemmed(sent, span))) {
if (highTfIdfInstances.contains(sent.constructStemmedWordFromSpan(span))) {
tfIdfHigh.add(new WordRep(sent, span));
}
}
@ -66,7 +66,7 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
for (var sent : dld.sentences) {
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
for (var span : keywords) {
counts.addTo(spanToStemmed(sent, span), 1);
counts.addTo(sent.constructStemmedWordFromSpan(span), 1);
}
}
@ -77,20 +77,6 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
return dict.getTermFreqStemmed(rep.stemmed);
}
/**
 * Builds the stemmed key for the words of {@code sentence} covered by {@code span}.
 *
 * <p>A span whose {@code size()} is 1 yields the stemmed word at {@code span.start}
 * as-is. Otherwise the stemmed words at indices {@code span.start} (inclusive) up to
 * {@code span.end} (exclusive) are concatenated with {@code '_'} between them.
 *
 * @param sentence the sentence whose {@code stemmedWords} array is read
 * @param span     the index range to combine
 * @return the single stemmed word, or the underscore-joined stemmed words
 */
private String spanToStemmed(DocumentSentence sentence, WordSpan span) {
    // Single-word spans need no joining.
    if (span.size() == 1) {
        return sentence.stemmedWords[span.start];
    }

    StringBuilder joined = new StringBuilder();
    int idx = span.start;
    while (idx < span.end) {
        // The separator is only written once the buffer already holds text,
        // so nothing is emitted before the first non-empty stemmed word.
        if (joined.length() > 0) {
            joined.append('_');
        }
        joined.append(sentence.stemmedWords[idx]);
        idx++;
    }
    return joined.toString();
}
public int getTfIdf(String stemmed) {
return tfIdf.getOrDefault(stemmed, 0);
}

View File

@ -1,10 +1,5 @@
package nu.marginalia.keyword.model;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import nu.marginalia.model.idx.WordMetadata;
import java.io.Serial;
@ -23,6 +18,16 @@ public final class DocumentKeywords implements Serializable {
{
this.keywords = keywords;
this.metadata = metadata;
assert keywords.length == metadata.length;
if (DocumentKeywords.class.desiredAssertionStatus()) {
for (int i = 0; i < metadata.length; i++) {
if (metadata[i] == 0) {
System.err.println("Bad metadata for keyword " + keywords[i]);
}
}
}
}
@Override

View File

@ -36,6 +36,7 @@ dependencies {
implementation project(':code:features-convert:adblock')
implementation project(':code:features-convert:topic-detection')
implementation project(':code:features-convert:keyword-extraction')
implementation libs.lombok
annotationProcessor libs.lombok

View File

@ -1,10 +1,14 @@
package nu.marginalia.tools.experiments;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.tools.Experiment;
import org.jsoup.Jsoup;
@ -18,6 +22,7 @@ import java.nio.file.Path;
public class SentenceStatisticsExperiment extends Experiment {
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
Path filename;
PrintWriter writer;
@ -32,6 +37,7 @@ public class SentenceStatisticsExperiment extends Experiment {
/**
 * Writes {@code message} to standard output as an in-place status line.
 *
 * <p>The text is prefixed with the ANSI sequence {@code ESC[2K} (erase the
 * current line) and a carriage return, and no newline is appended, so each
 * call overwrites the previous status line on an ANSI-capable terminal.
 *
 * @param message the text to display
 */
private void logLine(String message) {
    String statusLine = String.format("\u001b[2K\r%s", message);
    System.out.print(statusLine);
}
@SneakyThrows
@Override
public boolean process(CrawledDomain domain) {
if (domain.doc == null) return true;
@ -46,17 +52,9 @@ public class SentenceStatisticsExperiment extends Experiment {
parsed.body().filter(new DomPruningFilter(0.5));
var dld = se.extractSentences(parsed);
var keywords = documentKeywordExtractor.extractKeywords(dld, new EdgeUrl(doc.url));
int numSentences = dld.sentences.length;
if (numSentences == 0) {
continue;
}
double avgLength = dld.totalNumWords() / (double) numSentences;
if (avgLength < 5 && dld.totalNumWords() > 250) {
writer.printf("%s\t%d\t%f\n", doc.url, dld.totalNumWords(), avgLength);
}
keywords.build();
}
return true;

View File

@ -14,6 +14,7 @@ echo "args = $ARGS"
JAVA_OPTS="
-Dcrawl.rootDirRewrite=/crawl:${SAMPLE_DIR}
-Ddb.overrideJdbc=jdbc:mariadb://localhost:3306/WMSA_prod?rewriteBatchedStatements=true
-ea
"
## Configuration ends