diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java index b3b098cf..8feb5fd8 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -89,7 +89,11 @@ public class DocumentKeywordExtractor { var word = rep.word; if (!word.isBlank()) { - wordsBuilder.add(word, metadata.getMetadataForWord(rep.stemmed)); + long meta = metadata.getMetadataForWord(rep.stemmed); + + assert meta != 0L : "Missing meta for " + rep.word; + + wordsBuilder.add(word, meta); } } } @@ -112,14 +116,20 @@ public class DocumentKeywordExtractor { String w = word.wordLowerCase(); if (matchesWordPattern(w)) { - wordsBuilder.add(w, metadata.getMetadataForWord(word.stemmed())); + long meta = metadata.getMetadataForWord(word.stemmed()); + assert meta != 0L : "Missing meta for " + word.word(); + + wordsBuilder.add(w, meta); } } for (var names : keywordExtractor.getProperNames(sent)) { var rep = new WordRep(sent, names); - wordsBuilder.add(rep.word, metadata.getMetadataForWord(rep.stemmed)); + long meta = metadata.getMetadataForWord(rep.stemmed); + assert meta != 0L : "Missing meta for " + rep.word; + + wordsBuilder.add(rep.word, meta); } } } diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java index b9274730..5e91b12c 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java @@ -23,6 +23,10 @@ public class KeywordPositionBitmask { positionMask.merge(word.stemmed(), posBit, this::bitwiseOr); } + for (var span : keywordExtractor.getKeywordsFromSentence(sent)) { + positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); + } + for (var span : keywordExtractor.getProperNames(sent)) { positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); } @@ -38,6 +42,10 @@ public class KeywordPositionBitmask { positionMask.merge(word.stemmed(), posBit, this::bitwiseOr); } + for (var span : keywordExtractor.getKeywordsFromSentence(sent)) { + positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); + } + for (var span : keywordExtractor.getProperNames(sent)) { positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); } diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java index 64f50dde..559f69a6 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java @@ -51,7 +51,7 @@ public class WordsTfIdfCounts implements WordReps, Comparator { for (var sent : dld.sentences) { var keywords = keywordExtractor.getKeywordsFromSentence(sent); for (var span : keywords) { - if (highTfIdfInstances.contains(spanToStemmed(sent, span))) { + if (highTfIdfInstances.contains(sent.constructStemmedWordFromSpan(span))) { tfIdfHigh.add(new WordRep(sent, span)); } } @@ -66,7 +66,7 @@ public class WordsTfIdfCounts implements WordReps, Comparator { for (var sent : dld.sentences) { var keywords = keywordExtractor.getKeywordsFromSentence(sent); for (var span : keywords) { - counts.addTo(spanToStemmed(sent, span), 1); + counts.addTo(sent.constructStemmedWordFromSpan(span), 1); } } @@ -77,20 +77,6 @@ public class WordsTfIdfCounts implements WordReps, Comparator { return dict.getTermFreqStemmed(rep.stemmed); } - private String spanToStemmed(DocumentSentence sentence, WordSpan span) { - if (span.size() == 1) - return sentence.stemmedWords[span.start]; - - StringBuilder builder = new StringBuilder(); - for (int i = span.start; i < span.end; i++) { - if (!builder.isEmpty()) - builder.append('_'); - builder.append(sentence.stemmedWords[i]); - } - return builder.toString(); - - } - public int getTfIdf(String stemmed) { return tfIdf.getOrDefault(stemmed, 0); } diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java index d22e0240..f8ad86d7 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java @@ -1,10 +1,5 @@ package nu.marginalia.keyword.model; - -import lombok.AllArgsConstructor; -import lombok.Getter; -import lombok.NoArgsConstructor; -import lombok.Setter; import nu.marginalia.model.idx.WordMetadata; import java.io.Serial; @@ -23,6 +18,16 @@ public final class DocumentKeywords implements Serializable { { this.keywords = keywords; this.metadata = metadata; + + assert keywords.length == metadata.length; + + if (DocumentKeywords.class.desiredAssertionStatus()) { + for (int i = 0; i < metadata.length; i++) { + if (metadata[i] == 0) { + System.err.println("Bad metadata for keyword " + keywords[i]); + } + } + } } @Override diff --git a/code/tools/experiment-runner/build.gradle b/code/tools/experiment-runner/build.gradle index 3552326a..77d84e21 100644 --- a/code/tools/experiment-runner/build.gradle +++ b/code/tools/experiment-runner/build.gradle @@ -36,6 +36,7 @@ dependencies { implementation project(':code:features-convert:adblock') implementation project(':code:features-convert:topic-detection') + implementation project(':code:features-convert:keyword-extraction') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java index 44f3cf18..554ba4fe 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java @@ -1,10 +1,14 @@ package nu.marginalia.tools.experiments; import com.google.inject.Inject; +import lombok.SneakyThrows; import nu.marginalia.WmsaHome; import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.tools.Experiment; import org.jsoup.Jsoup; @@ -18,6 +22,7 @@ import java.nio.file.Path; public class SentenceStatisticsExperiment extends Experiment { SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); + DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels())); Path filename; PrintWriter writer; @@ -32,6 +37,7 @@ public class SentenceStatisticsExperiment extends Experiment { private void logLine(String message) { System.out.printf("\u001b[2K\r%s", message); } + @SneakyThrows @Override public boolean process(CrawledDomain domain) { if (domain.doc == null) return true; @@ -46,17 +52,9 @@ public class SentenceStatisticsExperiment extends Experiment { parsed.body().filter(new DomPruningFilter(0.5)); var dld = se.extractSentences(parsed); + var keywords = documentKeywordExtractor.extractKeywords(dld, new EdgeUrl(doc.url)); - - int numSentences = dld.sentences.length; - if (numSentences == 0) { - continue; - } - - double avgLength = dld.totalNumWords() / (double) numSentences; - if (avgLength < 5 && dld.totalNumWords() > 250) { - writer.printf("%s\t%d\t%f\n", doc.url, dld.totalNumWords(), avgLength); - } + keywords.build(); } return true; diff --git a/run/experiment.sh b/run/experiment.sh index 4343c0e0..f8651bf5 100755 --- a/run/experiment.sh +++ b/run/experiment.sh @@ -14,6 +14,7 @@ echo "args = $ARGS" JAVA_OPTS=" -Dcrawl.rootDirRewrite=/crawl:${SAMPLE_DIR} -Ddb.overrideJdbc=jdbc:mariadb://localhost:3306/WMSA_prod?rewriteBatchedStatements=true +-ea " ## Configuration ends