mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(keyword-extraction) Fix bug leading to position data missing on some keywords.
This was due to a discrepancy between the KeywordPositionBitmask and WordsTfIdfCounts' concept of a keyword.
This commit is contained in:
parent
9e185e80ce
commit
c68d17d482
@@ -89,7 +89,11 @@ public class DocumentKeywordExtractor {
|
||||
var word = rep.word;
|
||||
|
||||
if (!word.isBlank()) {
|
||||
wordsBuilder.add(word, metadata.getMetadataForWord(rep.stemmed));
|
||||
long meta = metadata.getMetadataForWord(rep.stemmed);
|
||||
|
||||
assert meta != 0L : "Missing meta for " + rep.word;
|
||||
|
||||
wordsBuilder.add(word, meta);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -112,14 +116,20 @@ public class DocumentKeywordExtractor {
|
||||
|
||||
String w = word.wordLowerCase();
|
||||
if (matchesWordPattern(w)) {
|
||||
wordsBuilder.add(w, metadata.getMetadataForWord(word.stemmed()));
|
||||
long meta = metadata.getMetadataForWord(word.stemmed());
|
||||
assert meta != 0L : "Missing meta for " + word.word();
|
||||
|
||||
wordsBuilder.add(w, meta);
|
||||
}
|
||||
}
|
||||
|
||||
for (var names : keywordExtractor.getProperNames(sent)) {
|
||||
var rep = new WordRep(sent, names);
|
||||
|
||||
wordsBuilder.add(rep.word, metadata.getMetadataForWord(rep.stemmed));
|
||||
long meta = metadata.getMetadataForWord(rep.stemmed);
|
||||
assert meta != 0L : "Missing meta for " + rep.word;
|
||||
|
||||
wordsBuilder.add(rep.word, meta);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -23,6 +23,10 @@ public class KeywordPositionBitmask {
|
||||
positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
||||
for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
|
||||
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
||||
for (var span : keywordExtractor.getProperNames(sent)) {
|
||||
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
|
||||
}
|
||||
@@ -38,6 +42,10 @@ public class KeywordPositionBitmask {
|
||||
positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
||||
for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
|
||||
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
||||
for (var span : keywordExtractor.getProperNames(sent)) {
|
||||
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
@@ -51,7 +51,7 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
|
||||
for (var sent : dld.sentences) {
|
||||
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
|
||||
for (var span : keywords) {
|
||||
if (highTfIdfInstances.contains(spanToStemmed(sent, span))) {
|
||||
if (highTfIdfInstances.contains(sent.constructStemmedWordFromSpan(span))) {
|
||||
tfIdfHigh.add(new WordRep(sent, span));
|
||||
}
|
||||
}
|
||||
@@ -66,7 +66,7 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
|
||||
for (var sent : dld.sentences) {
|
||||
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
|
||||
for (var span : keywords) {
|
||||
counts.addTo(spanToStemmed(sent, span), 1);
|
||||
counts.addTo(sent.constructStemmedWordFromSpan(span), 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -77,20 +77,6 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
|
||||
return dict.getTermFreqStemmed(rep.stemmed);
|
||||
}
|
||||
|
||||
private String spanToStemmed(DocumentSentence sentence, WordSpan span) {
|
||||
if (span.size() == 1)
|
||||
return sentence.stemmedWords[span.start];
|
||||
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (int i = span.start; i < span.end; i++) {
|
||||
if (!builder.isEmpty())
|
||||
builder.append('_');
|
||||
builder.append(sentence.stemmedWords[i]);
|
||||
}
|
||||
return builder.toString();
|
||||
|
||||
}
|
||||
|
||||
public int getTfIdf(String stemmed) {
|
||||
return tfIdf.getOrDefault(stemmed, 0);
|
||||
}
|
||||
|
@@ -1,10 +1,5 @@
|
||||
package nu.marginalia.keyword.model;
|
||||
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
|
||||
import java.io.Serial;
|
||||
@@ -23,6 +18,16 @@ public final class DocumentKeywords implements Serializable {
|
||||
{
|
||||
this.keywords = keywords;
|
||||
this.metadata = metadata;
|
||||
|
||||
assert keywords.length == metadata.length;
|
||||
|
||||
if (DocumentKeywords.class.desiredAssertionStatus()) {
|
||||
for (int i = 0; i < metadata.length; i++) {
|
||||
if (metadata[i] == 0) {
|
||||
System.err.println("Bad metadata for keyword " + keywords[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@@ -36,6 +36,7 @@ dependencies {
|
||||
|
||||
implementation project(':code:features-convert:adblock')
|
||||
implementation project(':code:features-convert:topic-detection')
|
||||
implementation project(':code:features-convert:keyword-extraction')
|
||||
|
||||
implementation libs.lombok
|
||||
annotationProcessor libs.lombok
|
||||
|
@@ -1,10 +1,14 @@
|
||||
package nu.marginalia.tools.experiments;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import nu.marginalia.tools.Experiment;
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
@@ -18,6 +22,7 @@ import java.nio.file.Path;
|
||||
public class SentenceStatisticsExperiment extends Experiment {
|
||||
|
||||
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||
DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
|
||||
Path filename;
|
||||
PrintWriter writer;
|
||||
|
||||
@@ -32,6 +37,7 @@ public class SentenceStatisticsExperiment extends Experiment {
|
||||
private void logLine(String message) {
|
||||
System.out.printf("\u001b[2K\r%s", message);
|
||||
}
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public boolean process(CrawledDomain domain) {
|
||||
if (domain.doc == null) return true;
|
||||
@@ -46,17 +52,9 @@ public class SentenceStatisticsExperiment extends Experiment {
|
||||
parsed.body().filter(new DomPruningFilter(0.5));
|
||||
|
||||
var dld = se.extractSentences(parsed);
|
||||
var keywords = documentKeywordExtractor.extractKeywords(dld, new EdgeUrl(doc.url));
|
||||
|
||||
|
||||
int numSentences = dld.sentences.length;
|
||||
if (numSentences == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
double avgLength = dld.totalNumWords() / (double) numSentences;
|
||||
if (avgLength < 5 && dld.totalNumWords() > 250) {
|
||||
writer.printf("%s\t%d\t%f\n", doc.url, dld.totalNumWords(), avgLength);
|
||||
}
|
||||
keywords.build();
|
||||
}
|
||||
|
||||
return true;
|
||||
|
@@ -14,6 +14,7 @@ echo "args = $ARGS"
|
||||
JAVA_OPTS="
|
||||
-Dcrawl.rootDirRewrite=/crawl:${SAMPLE_DIR}
|
||||
-Ddb.overrideJdbc=jdbc:mariadb://localhost:3306/WMSA_prod?rewriteBatchedStatements=true
|
||||
-ea
|
||||
"
|
||||
|
||||
## Configuration ends
|
||||
|
Loading…
Reference in New Issue
Block a user