mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(keyword-extractor) Retire TfIdfHigh WordFlag
This brings the number of word flags down to 8, so the complete set of flags can be packed into a single byte.
This commit is contained in:
parent
0d227f3543
commit
d36055a2d0
@ -4,16 +4,12 @@ package nu.marginalia.model.idx;
|
||||
import java.util.EnumSet;
|
||||
|
||||
public enum WordFlags {
|
||||
|
||||
/** Word appears in title */
|
||||
Title,
|
||||
|
||||
/** Word appears to be the subject in several sentences */
|
||||
Subjects,
|
||||
|
||||
/** Word has high tf-idf */
|
||||
TfIdfHigh,
|
||||
|
||||
/** Word is a likely named object. This is a weaker version of Subjects. */
|
||||
NamesWords,
|
||||
|
||||
|
@ -1,16 +1,16 @@
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.keyword.extractors.*;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.WordRep;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
|
||||
@ -44,7 +44,6 @@ public class DocumentKeywordExtractor {
|
||||
var urlKeywords = new UrlKeywords(url);
|
||||
|
||||
var keywordMetadata = KeywordMetadata.builder()
|
||||
.tfIdfCounts(tfIdfCounts)
|
||||
.titleKeywords(titleKeywords)
|
||||
.nameLikeKeywords(nameLikeKeywords)
|
||||
.subjectLikeKeywords(subjectLikeKeywords)
|
||||
@ -55,7 +54,6 @@ public class DocumentKeywordExtractor {
|
||||
|
||||
createSimpleWords(wordsBuilder, keywordMetadata, dld);
|
||||
|
||||
createNGramTermsFromSet(wordsBuilder, keywordMetadata, tfIdfCounts);
|
||||
createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
|
||||
createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
|
||||
createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
|
||||
@ -69,7 +67,7 @@ public class DocumentKeywordExtractor {
|
||||
}
|
||||
|
||||
private static Collection<String> getImportantWords(WordsTfIdfCounts tfIdfCounts, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, DocumentKeywordsBuilder wordsBuilder) {
|
||||
return Stream.of(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords)
|
||||
return Stream.of(nameLikeKeywords, subjectLikeKeywords)
|
||||
.flatMap(k -> k.getReps().stream())
|
||||
.filter(w -> {
|
||||
if (w.word.length() < 3)
|
||||
|
@ -1,7 +1,10 @@
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import lombok.Builder;
|
||||
import nu.marginalia.keyword.extractors.*;
|
||||
import nu.marginalia.keyword.extractors.NameLikeKeywords;
|
||||
import nu.marginalia.keyword.extractors.SubjectLikeKeywords;
|
||||
import nu.marginalia.keyword.extractors.TitleKeywords;
|
||||
import nu.marginalia.keyword.extractors.UrlKeywords;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
|
||||
class KeywordMetadata {
|
||||
@ -10,32 +13,24 @@ class KeywordMetadata {
|
||||
private final NameLikeKeywords nameLikeKeywords;
|
||||
private final SubjectLikeKeywords subjectLikeKeywords;
|
||||
private final UrlKeywords urlKeywords;
|
||||
private final WordsTfIdfCounts tfIdfCounts;
|
||||
|
||||
@Builder
|
||||
public KeywordMetadata(
|
||||
TitleKeywords titleKeywords,
|
||||
NameLikeKeywords nameLikeKeywords,
|
||||
SubjectLikeKeywords subjectLikeKeywords,
|
||||
UrlKeywords urlKeywords,
|
||||
WordsTfIdfCounts tfIdfCounts)
|
||||
UrlKeywords urlKeywords)
|
||||
{
|
||||
this.titleKeywords = titleKeywords;
|
||||
this.nameLikeKeywords = nameLikeKeywords;
|
||||
this.subjectLikeKeywords = subjectLikeKeywords;
|
||||
this.urlKeywords = urlKeywords;
|
||||
this.tfIdfCounts = tfIdfCounts;
|
||||
}
|
||||
|
||||
public long getMetadataForWord(String stemmed) {
|
||||
|
||||
int tfidf = tfIdfCounts.getTfIdf(stemmed);
|
||||
long flags = 0;
|
||||
|
||||
if (tfidf > 100) {
|
||||
flags |= WordFlags.TfIdfHigh.asBit();
|
||||
}
|
||||
|
||||
if (subjectLikeKeywords.contains(stemmed)) {
|
||||
flags |= WordFlags.Subjects.asBit();
|
||||
}
|
||||
|
@ -1,8 +1,8 @@
|
||||
package nu.marginalia.converting.processor.logic.links;
|
||||
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@ -13,7 +13,7 @@ public class TopKeywords {
|
||||
if (doc.details == null || doc.details.linksInternal == null)
|
||||
return;
|
||||
|
||||
List<String> topKeywords = doc.words.getWordsWithAnyFlag(WordFlags.TfIdfHigh.asBit() | WordFlags.Subjects.asBit());
|
||||
List<String> topKeywords = doc.words.getWordsWithAnyFlag(WordFlags.Subjects.asBit());
|
||||
|
||||
topKeywordsByUrl.put(doc.url, new HashSet<>(topKeywords));
|
||||
}
|
||||
|
@ -6,18 +6,14 @@ import com.google.inject.Inject;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.ProcessConfiguration;
|
||||
import nu.marginalia.ProcessConfigurationModule;
|
||||
import nu.marginalia.index.construction.full.FullIndexConstructor;
|
||||
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
|
||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||
import nu.marginalia.service.ProcessMainClass;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.index.construction.full.FullIndexConstructor;
|
||||
import nu.marginalia.index.forward.ForwardIndexConverter;
|
||||
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.mq.MqMessage;
|
||||
import nu.marginalia.mq.inbox.MqInboxResponse;
|
||||
@ -25,7 +21,9 @@ import nu.marginalia.mq.inbox.MqSingleShotInbox;
|
||||
import nu.marginalia.mqapi.index.CreateIndexRequest;
|
||||
import nu.marginalia.mqapi.index.IndexName;
|
||||
import nu.marginalia.process.control.ProcessHeartbeatImpl;
|
||||
import nu.marginalia.service.ProcessMainClass;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -36,7 +34,6 @@ import java.sql.SQLException;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.LongPredicate;
|
||||
|
||||
import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX;
|
||||
|
||||
@ -138,36 +135,16 @@ public class IndexConstructorMain extends ProcessMainClass {
|
||||
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
|
||||
Path tmpDir = workDir.resolve("tmp");
|
||||
|
||||
// The priority index only includes words that have bits indicating they are
|
||||
// important to the document. This filter will act on the encoded {@see WordMetadata}
|
||||
LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter();
|
||||
|
||||
var constructor = new PrioIndexConstructor(
|
||||
outputFileDocs,
|
||||
outputFileWords,
|
||||
(path) -> IndexJournalReader.singleFile(path).filtering(wordMetaFilter),
|
||||
(path) -> IndexJournalReader.singleFile(path).filtering(r -> r != 0),
|
||||
this::addRankToIdEncoding,
|
||||
tmpDir);
|
||||
|
||||
constructor.createReverseIndex(heartbeat, "createReverseIndexPrio", workDir);
|
||||
}
|
||||
|
||||
private static LongPredicate getPriorityIndexWordMetaFilter() {
|
||||
|
||||
long highPriorityFlags =
|
||||
WordFlags.Title.asBit()
|
||||
| WordFlags.Subjects.asBit()
|
||||
| WordFlags.TfIdfHigh.asBit()
|
||||
| WordFlags.NamesWords.asBit()
|
||||
| WordFlags.UrlDomain.asBit()
|
||||
| WordFlags.UrlPath.asBit()
|
||||
| WordFlags.Site.asBit()
|
||||
| WordFlags.ExternalLink.asBit()
|
||||
| WordFlags.SiteAdjacent.asBit();
|
||||
|
||||
return r -> WordMetadata.hasAnyFlags(r, highPriorityFlags);
|
||||
}
|
||||
|
||||
private void createForwardIndex() throws IOException {
|
||||
|
||||
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
|
||||
|
@ -37,14 +37,14 @@ import nu.marginalia.loading.links.DomainLinksLoaderService;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import nu.marginalia.test.IntegrationTestModule;
|
||||
import nu.marginalia.test.TestUtil;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
import java.io.IOException;
|
||||
@ -52,7 +52,6 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.List;
|
||||
import java.util.function.LongPredicate;
|
||||
|
||||
import static nu.marginalia.index.journal.reader.IndexJournalReader.FILE_HEADER_SIZE_BYTES;
|
||||
import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
|
||||
@ -265,36 +264,16 @@ public class IntegrationTest {
|
||||
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
|
||||
Path tmpDir = workDir.resolve("tmp");
|
||||
|
||||
// The priority index only includes words that have bits indicating they are
|
||||
// important to the document. This filter will act on the encoded {@see WordMetadata}
|
||||
LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter();
|
||||
|
||||
var constructor = new PrioIndexConstructor(
|
||||
outputFileDocs,
|
||||
outputFileWords,
|
||||
(path) -> IndexJournalReader.singleFile(path).filtering(wordMetaFilter),
|
||||
(path) -> IndexJournalReader.singleFile(path).filtering(r -> r != 0),
|
||||
this::addRankToIdEncoding,
|
||||
tmpDir);
|
||||
|
||||
constructor.createReverseIndex(new FakeProcessHeartbeat(), "createReverseIndexPrio", workDir);
|
||||
}
|
||||
|
||||
private static LongPredicate getPriorityIndexWordMetaFilter() {
|
||||
|
||||
long highPriorityFlags =
|
||||
WordFlags.Title.asBit()
|
||||
| WordFlags.Subjects.asBit()
|
||||
| WordFlags.TfIdfHigh.asBit()
|
||||
| WordFlags.NamesWords.asBit()
|
||||
| WordFlags.UrlDomain.asBit()
|
||||
| WordFlags.UrlPath.asBit()
|
||||
| WordFlags.Site.asBit()
|
||||
| WordFlags.ExternalLink.asBit()
|
||||
| WordFlags.SiteAdjacent.asBit();
|
||||
|
||||
return r -> WordMetadata.hasAnyFlags(r, highPriorityFlags);
|
||||
}
|
||||
|
||||
private void createForwardIndex() throws IOException {
|
||||
|
||||
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
|
||||
|
Loading…
Reference in New Issue
Block a user