(keyword-extractor) Retire TfIdfHigh WordFlag

This brings the number of word flags down to 8, so that the complete set of flags for a word can be packed into a single byte.
This commit is contained in:
Viktor Lofgren 2024-07-17 13:54:39 +02:00
parent 0d227f3543
commit d36055a2d0
6 changed files with 20 additions and 75 deletions

View File

@ -4,16 +4,12 @@ package nu.marginalia.model.idx;
import java.util.EnumSet;
public enum WordFlags {
/** Word appears in title */
Title,
/** Word appears to be the subject in several sentences */
Subjects,
/** Word has high tf-idf */
TfIdfHigh,
/** Word is a likely named object. This is a weaker version of Subjects. */
NamesWords,

View File

@ -1,16 +1,16 @@
package nu.marginalia.keyword;
import com.google.inject.Inject;
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import com.google.inject.Inject;
import java.util.*;
import java.util.Collection;
import java.util.Comparator;
import java.util.stream.Stream;
@ -44,7 +44,6 @@ public class DocumentKeywordExtractor {
var urlKeywords = new UrlKeywords(url);
var keywordMetadata = KeywordMetadata.builder()
.tfIdfCounts(tfIdfCounts)
.titleKeywords(titleKeywords)
.nameLikeKeywords(nameLikeKeywords)
.subjectLikeKeywords(subjectLikeKeywords)
@ -55,7 +54,6 @@ public class DocumentKeywordExtractor {
createSimpleWords(wordsBuilder, keywordMetadata, dld);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, tfIdfCounts);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
@ -69,7 +67,7 @@ public class DocumentKeywordExtractor {
}
private static Collection<String> getImportantWords(WordsTfIdfCounts tfIdfCounts, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, DocumentKeywordsBuilder wordsBuilder) {
return Stream.of(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords)
return Stream.of(nameLikeKeywords, subjectLikeKeywords)
.flatMap(k -> k.getReps().stream())
.filter(w -> {
if (w.word.length() < 3)

View File

@ -1,7 +1,10 @@
package nu.marginalia.keyword;
import lombok.Builder;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.keyword.extractors.NameLikeKeywords;
import nu.marginalia.keyword.extractors.SubjectLikeKeywords;
import nu.marginalia.keyword.extractors.TitleKeywords;
import nu.marginalia.keyword.extractors.UrlKeywords;
import nu.marginalia.model.idx.WordFlags;
class KeywordMetadata {
@ -10,32 +13,24 @@ class KeywordMetadata {
private final NameLikeKeywords nameLikeKeywords;
private final SubjectLikeKeywords subjectLikeKeywords;
private final UrlKeywords urlKeywords;
private final WordsTfIdfCounts tfIdfCounts;
@Builder
public KeywordMetadata(
TitleKeywords titleKeywords,
NameLikeKeywords nameLikeKeywords,
SubjectLikeKeywords subjectLikeKeywords,
UrlKeywords urlKeywords,
WordsTfIdfCounts tfIdfCounts)
UrlKeywords urlKeywords)
{
this.titleKeywords = titleKeywords;
this.nameLikeKeywords = nameLikeKeywords;
this.subjectLikeKeywords = subjectLikeKeywords;
this.urlKeywords = urlKeywords;
this.tfIdfCounts = tfIdfCounts;
}
public long getMetadataForWord(String stemmed) {
int tfidf = tfIdfCounts.getTfIdf(stemmed);
long flags = 0;
if (tfidf > 100) {
flags |= WordFlags.TfIdfHigh.asBit();
}
if (subjectLikeKeywords.contains(stemmed)) {
flags |= WordFlags.Subjects.asBit();
}

View File

@ -1,8 +1,8 @@
package nu.marginalia.converting.processor.logic.links;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordFlags;
import java.util.*;
@ -13,7 +13,7 @@ public class TopKeywords {
if (doc.details == null || doc.details.linksInternal == null)
return;
List<String> topKeywords = doc.words.getWordsWithAnyFlag(WordFlags.TfIdfHigh.asBit() | WordFlags.Subjects.asBit());
List<String> topKeywords = doc.words.getWordsWithAnyFlag(WordFlags.Subjects.asBit());
topKeywordsByUrl.put(doc.url, new HashSet<>(topKeywords));
}

View File

@ -6,18 +6,14 @@ import com.google.inject.Inject;
import nu.marginalia.IndexLocations;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mq.MqMessage;
import nu.marginalia.mq.inbox.MqInboxResponse;
@ -25,7 +21,9 @@ import nu.marginalia.mq.inbox.MqSingleShotInbox;
import nu.marginalia.mqapi.index.CreateIndexRequest;
import nu.marginalia.mqapi.index.IndexName;
import nu.marginalia.process.control.ProcessHeartbeatImpl;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.storage.FileStorageService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -36,7 +34,6 @@ import java.sql.SQLException;
import java.util.Optional;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
import java.util.function.LongPredicate;
import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX;
@ -138,36 +135,16 @@ public class IndexConstructorMain extends ProcessMainClass {
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path tmpDir = workDir.resolve("tmp");
// The priority index only includes words that have bits indicating they are
// important to the document. This filter will act on the encoded {@link WordMetadata}
LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter();
var constructor = new PrioIndexConstructor(
outputFileDocs,
outputFileWords,
(path) -> IndexJournalReader.singleFile(path).filtering(wordMetaFilter),
(path) -> IndexJournalReader.singleFile(path).filtering(r -> r != 0),
this::addRankToIdEncoding,
tmpDir);
constructor.createReverseIndex(heartbeat, "createReverseIndexPrio", workDir);
}
/**
 * Builds the word-metadata filter used when constructing the priority index.
 *
 * <p>The bits of the flags listed below are OR-ed into a single mask, and the
 * returned predicate hands each encoded metadata value to
 * {@code WordMetadata.hasAnyFlags} together with that mask.
 *
 * <p>NOTE(review): the mask includes {@code WordFlags.TfIdfHigh}, which the
 * commit message on this page says is being retired -- confirm that this
 * method belongs to the removed side of the diff.
 *
 * @return a predicate over encoded word metadata, backed by
 *         {@code WordMetadata.hasAnyFlags(r, highPriorityFlags)}
 */
private static LongPredicate getPriorityIndexWordMetaFilter() {
// Combined bit mask of every flag this filter treats as high priority.
long highPriorityFlags =
WordFlags.Title.asBit()
| WordFlags.Subjects.asBit()
| WordFlags.TfIdfHigh.asBit()
| WordFlags.NamesWords.asBit()
| WordFlags.UrlDomain.asBit()
| WordFlags.UrlPath.asBit()
| WordFlags.Site.asBit()
| WordFlags.ExternalLink.asBit()
| WordFlags.SiteAdjacent.asBit();
// Presumably true when r carries at least one of the masked bits -- verify against WordMetadata.hasAnyFlags.
return r -> WordMetadata.hasAnyFlags(r, highPriorityFlags);
}
private void createForwardIndex() throws IOException {
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);

View File

@ -37,14 +37,14 @@ import nu.marginalia.loading.links.DomainLinksLoaderService;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.test.IntegrationTestModule;
import nu.marginalia.test.TestUtil;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import java.io.IOException;
@ -52,7 +52,6 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.List;
import java.util.function.LongPredicate;
import static nu.marginalia.index.journal.reader.IndexJournalReader.FILE_HEADER_SIZE_BYTES;
import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
@ -265,36 +264,16 @@ public class IntegrationTest {
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path tmpDir = workDir.resolve("tmp");
// The priority index only includes words that have bits indicating they are
// important to the document. This filter will act on the encoded {@link WordMetadata}
LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter();
var constructor = new PrioIndexConstructor(
outputFileDocs,
outputFileWords,
(path) -> IndexJournalReader.singleFile(path).filtering(wordMetaFilter),
(path) -> IndexJournalReader.singleFile(path).filtering(r -> r != 0),
this::addRankToIdEncoding,
tmpDir);
constructor.createReverseIndex(new FakeProcessHeartbeat(), "createReverseIndexPrio", workDir);
}
/**
 * Test-side copy of the priority index word-metadata filter.
 *
 * <p>The bits of the flags listed below are OR-ed into a single mask, and the
 * returned predicate hands each encoded metadata value to
 * {@code WordMetadata.hasAnyFlags} together with that mask.
 *
 * <p>NOTE(review): the mask includes {@code WordFlags.TfIdfHigh}, which the
 * commit message on this page says is being retired -- confirm that this
 * method belongs to the removed side of the diff.
 *
 * @return a predicate over encoded word metadata, backed by
 *         {@code WordMetadata.hasAnyFlags(r, highPriorityFlags)}
 */
private static LongPredicate getPriorityIndexWordMetaFilter() {
// Combined bit mask of every flag this filter treats as high priority.
long highPriorityFlags =
WordFlags.Title.asBit()
| WordFlags.Subjects.asBit()
| WordFlags.TfIdfHigh.asBit()
| WordFlags.NamesWords.asBit()
| WordFlags.UrlDomain.asBit()
| WordFlags.UrlPath.asBit()
| WordFlags.Site.asBit()
| WordFlags.ExternalLink.asBit()
| WordFlags.SiteAdjacent.asBit();
// Presumably true when r carries at least one of the masked bits -- verify against WordMetadata.hasAnyFlags.
return r -> WordMetadata.hasAnyFlags(r, highPriorityFlags);
}
private void createForwardIndex() throws IOException {
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);