diff --git a/code/common/model/java/nu/marginalia/model/idx/WordFlags.java b/code/common/model/java/nu/marginalia/model/idx/WordFlags.java index db54df77..f9016c48 100644 --- a/code/common/model/java/nu/marginalia/model/idx/WordFlags.java +++ b/code/common/model/java/nu/marginalia/model/idx/WordFlags.java @@ -4,16 +4,12 @@ package nu.marginalia.model.idx; import java.util.EnumSet; public enum WordFlags { - /** Word appears in title */ Title, /** Word appears to be the subject in several sentences */ Subjects, - /** Word has high tf-idf */ - TfIdfHigh, - /** Word is a likely named object. This is a weaker version of Subjects. */ NamesWords, diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java index 61fbc0dd..facb601f 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -1,16 +1,16 @@ package nu.marginalia.keyword; +import com.google.inject.Inject; import nu.marginalia.WmsaHome; import nu.marginalia.keyword.extractors.*; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.WordRep; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; -import com.google.inject.Inject; - -import java.util.*; +import java.util.Collection; +import java.util.Comparator; import java.util.stream.Stream; @@ -44,7 +44,6 @@ public class DocumentKeywordExtractor { var urlKeywords = new UrlKeywords(url); var keywordMetadata = KeywordMetadata.builder() - .tfIdfCounts(tfIdfCounts) .titleKeywords(titleKeywords) .nameLikeKeywords(nameLikeKeywords) .subjectLikeKeywords(subjectLikeKeywords) @@ -55,7 +54,6 @@ public class DocumentKeywordExtractor { createSimpleWords(wordsBuilder, keywordMetadata, dld); - createNGramTermsFromSet(wordsBuilder, keywordMetadata, tfIdfCounts); createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords); createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords); createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords); @@ -69,7 +67,7 @@ public class DocumentKeywordExtractor { } private static Collection getImportantWords(WordsTfIdfCounts tfIdfCounts, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, DocumentKeywordsBuilder wordsBuilder) { - return Stream.of(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords) + return Stream.of(nameLikeKeywords, subjectLikeKeywords) .flatMap(k -> k.getReps().stream()) .filter(w -> { if (w.word.length() < 3) diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java index 4394936b..0bf5043a 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java @@ -1,7 +1,10 @@ package nu.marginalia.keyword; import lombok.Builder; -import nu.marginalia.keyword.extractors.*; +import nu.marginalia.keyword.extractors.NameLikeKeywords; +import nu.marginalia.keyword.extractors.SubjectLikeKeywords; +import nu.marginalia.keyword.extractors.TitleKeywords; +import nu.marginalia.keyword.extractors.UrlKeywords; import nu.marginalia.model.idx.WordFlags; class KeywordMetadata { @@ -10,32 +13,24 @@ class KeywordMetadata { private final NameLikeKeywords nameLikeKeywords; private final SubjectLikeKeywords subjectLikeKeywords; private final UrlKeywords urlKeywords; - private final WordsTfIdfCounts tfIdfCounts; @Builder public KeywordMetadata( TitleKeywords titleKeywords, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, - UrlKeywords urlKeywords, - WordsTfIdfCounts tfIdfCounts) + UrlKeywords urlKeywords) { this.titleKeywords = titleKeywords; this.nameLikeKeywords = nameLikeKeywords; this.subjectLikeKeywords = subjectLikeKeywords; this.urlKeywords = urlKeywords; - this.tfIdfCounts = tfIdfCounts; } public long getMetadataForWord(String stemmed) { - int tfidf = tfIdfCounts.getTfIdf(stemmed); long flags = 0; - if (tfidf > 100) { - flags |= WordFlags.TfIdfHigh.asBit(); - } - if (subjectLikeKeywords.contains(stemmed)) { flags |= WordFlags.Subjects.asBit(); } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/TopKeywords.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/TopKeywords.java index 89043750..4c646dd3 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/TopKeywords.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/TopKeywords.java @@ -1,8 +1,8 @@ package nu.marginalia.converting.processor.logic.links; -import nu.marginalia.model.idx.WordFlags; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.idx.WordFlags; import java.util.*; @@ -13,7 +13,7 @@ public class TopKeywords { if (doc.details == null || doc.details.linksInternal == null) return; - List topKeywords = doc.words.getWordsWithAnyFlag(WordFlags.TfIdfHigh.asBit() | WordFlags.Subjects.asBit()); + List topKeywords = doc.words.getWordsWithAnyFlag(WordFlags.Subjects.asBit()); topKeywordsByUrl.put(doc.url, new HashSet<>(topKeywords)); } diff --git a/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java b/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java index 34cd0738..4f7e9d90 100644 --- a/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java +++ b/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java @@ -6,18 +6,14 @@ import com.google.inject.Inject; import nu.marginalia.IndexLocations; import nu.marginalia.ProcessConfiguration; import nu.marginalia.ProcessConfigurationModule; +import nu.marginalia.index.construction.full.FullIndexConstructor; import nu.marginalia.index.construction.prio.PrioIndexConstructor; import nu.marginalia.index.domainrankings.DomainRankings; -import nu.marginalia.service.ProcessMainClass; -import nu.marginalia.storage.FileStorageService; -import nu.marginalia.index.construction.full.FullIndexConstructor; import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.model.id.UrlIdCodec; -import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.inbox.MqInboxResponse; @@ -25,7 +21,9 @@ import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.mqapi.index.CreateIndexRequest; import nu.marginalia.mqapi.index.IndexName; import nu.marginalia.process.control.ProcessHeartbeatImpl; +import nu.marginalia.service.ProcessMainClass; import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.storage.FileStorageService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,7 +34,6 @@ import java.sql.SQLException; import java.util.Optional; import java.util.UUID; import java.util.concurrent.TimeUnit; -import java.util.function.LongPredicate; import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX; @@ -138,36 +135,16 @@ public class IndexConstructorMain extends ProcessMainClass { Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); Path tmpDir = workDir.resolve("tmp"); - // The priority index only includes words that have bits indicating they are - // important to the document. This filter will act on the encoded {@see WordMetadata} - LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter(); - var constructor = new PrioIndexConstructor( outputFileDocs, outputFileWords, - (path) -> IndexJournalReader.singleFile(path).filtering(wordMetaFilter), + (path) -> IndexJournalReader.singleFile(path).filtering(r -> r != 0), this::addRankToIdEncoding, tmpDir); constructor.createReverseIndex(heartbeat, "createReverseIndexPrio", workDir); } - private static LongPredicate getPriorityIndexWordMetaFilter() { - - long highPriorityFlags = - WordFlags.Title.asBit() - | WordFlags.Subjects.asBit() - | WordFlags.TfIdfHigh.asBit() - | WordFlags.NamesWords.asBit() - | WordFlags.UrlDomain.asBit() - | WordFlags.UrlPath.asBit() - | WordFlags.Site.asBit() - | WordFlags.ExternalLink.asBit() - | WordFlags.SiteAdjacent.asBit(); - - return r -> WordMetadata.hasAnyFlags(r, highPriorityFlags); - } - private void createForwardIndex() throws IOException { Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); diff --git a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java index ca6ab9cc..7f75409d 100644 --- a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java +++ b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java @@ -37,14 +37,14 @@ import nu.marginalia.loading.links.DomainLinksLoaderService; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.id.UrlIdCodec; -import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.test.IntegrationTestModule; import nu.marginalia.test.TestUtil; -import org.junit.jupiter.api.*; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import org.mockito.Mockito; import java.io.IOException; @@ -52,7 +52,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; import java.util.List; -import java.util.function.LongPredicate; import static nu.marginalia.index.journal.reader.IndexJournalReader.FILE_HEADER_SIZE_BYTES; import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME; @@ -265,36 +264,16 @@ public class IntegrationTest { Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); Path tmpDir = workDir.resolve("tmp"); - // The priority index only includes words that have bits indicating they are - // important to the document. This filter will act on the encoded {@see WordMetadata} - LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter(); - var constructor = new PrioIndexConstructor( outputFileDocs, outputFileWords, - (path) -> IndexJournalReader.singleFile(path).filtering(wordMetaFilter), + (path) -> IndexJournalReader.singleFile(path).filtering(r -> r != 0), this::addRankToIdEncoding, tmpDir); constructor.createReverseIndex(new FakeProcessHeartbeat(), "createReverseIndexPrio", workDir); } - private static LongPredicate getPriorityIndexWordMetaFilter() { - - long highPriorityFlags = - WordFlags.Title.asBit() - | WordFlags.Subjects.asBit() - | WordFlags.TfIdfHigh.asBit() - | WordFlags.NamesWords.asBit() - | WordFlags.UrlDomain.asBit() - | WordFlags.UrlPath.asBit() - | WordFlags.Site.asBit() - | WordFlags.ExternalLink.asBit() - | WordFlags.SiteAdjacent.asBit(); - - return r -> WordMetadata.hasAnyFlags(r, highPriorityFlags); - } - private void createForwardIndex() throws IOException { Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);