Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00

commit efb46cc703 (parent 8fb531c614)

Remove count from WordMetadata entirely.
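Background for the change: the count field duplicated information already carried by the positions mask, where each set bit marks a region of the document containing the word, so an equivalent figure can be recovered with Integer.bitCount wherever it is still needed. A minimal sketch of that equivalence, reusing a positions value from the tests below (the demo class itself is illustrative, not part of the commit):

class BitCountDemo {
    public static void main(String[] args) {
        int positions = 0x7f0f0000; // the "Vanilla case" mask from WordMetadataTest

        // 0x7f contributes 7 set bits and 0x0f another 4, so this prints 11.
        System.out.println(Integer.bitCount(positions));
    }
}

This is the same derivation that SearchApiQueryService and SearchResultValuator switch to further down in the diff.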
@@ -11,18 +11,9 @@ import static java.lang.Math.min;
 public record WordMetadata(int tfIdf,
                            int positions,
-                           int count,
                            byte flags) {
-    public WordMetadata {
-        if (WordMetadata.class.desiredAssertionStatus()) {
-            if (Integer.bitCount(positions) > count) {
-                System.err.println(Integer.bitCount(positions) + ">" + count);
-            }
-        }
-    }
-
-    public static final long COUNT_MASK = 0xFL;
-    public static final int COUNT_SHIFT = 8;
+    // 8 unused bits at the beginning
 
     public static final long TF_IDF_MASK = 0xFFFFL;
     public static final int TF_IDF_SHIFT = 16;
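For orientation, the packed 64-bit layout after this hunk, as far as it can be read off the visible masks; FLAGS_MASK and POSITIONS_SHIFT are defined outside the quoted lines, so the values attributed to them below are inferred, not quoted:

// Assumed layout of the encoded long:
//   bits  0..7   flags      (FLAGS_MASK, presumably 0xFF)
//   bits  8..15  unused     (count previously occupied bits 8..11: COUNT_MASK 0xF, COUNT_SHIFT 8)
//   bits 16..31  tfIdf      (TF_IDF_MASK 0xFFFF, TF_IDF_SHIFT 16)
//   bits 32..63  positions  (a 32-bit mask, so POSITIONS_SHIFT is presumably 32)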
@@ -41,17 +32,15 @@ public record WordMetadata(int tfIdf,
         this(
                 (int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK),
                 (int)((value >>> POSITIONS_SHIFT) & POSITIONS_MASK),
-                Integer.bitCount((int) ((value >>> POSITIONS_SHIFT) & POSITIONS_MASK)),
                 (byte) (value & FLAGS_MASK)
         );
     }
 
     public WordMetadata(int tfIdf,
                         int positions,
-                        int count,
                         Set<EdgePageWordFlags> flags)
     {
-        this(tfIdf, positions, count, encodeFlags(flags));
+        this(tfIdf, positions, encodeFlags(flags));
     }
 
     private static byte encodeFlags(Set<EdgePageWordFlags> flags) {
@@ -82,7 +71,6 @@ public record WordMetadata(int tfIdf,
         StringBuilder sb = new StringBuilder(getClass().getSimpleName());
         sb.append('[')
                 .append("tfidf=").append(tfIdf).append(", ")
-                .append("count=").append(count).append(", ")
                 .append("positions=[").append(BrailleBlockPunchCards.printBits(positions, 32)).append(']');
         sb.append(", flags=").append(flags).append(']');
         return sb.toString();
@@ -95,14 +83,13 @@ public record WordMetadata(int tfIdf,
 
         ret |= Byte.toUnsignedLong(flags);
         ret |= min(TF_IDF_MASK, max(0, tfIdf)) << TF_IDF_SHIFT;
-        ret |= min(COUNT_MASK, max(0, count)) << COUNT_SHIFT;
         ret |= ((long)(positions)) << POSITIONS_SHIFT;
 
         return ret;
     }
 
     public boolean isEmpty() {
-        return count == 0 && positions == 0 && flags == 0 && tfIdf == 0;
+        return positions == 0 && flags == 0 && tfIdf == 0;
     }
 
     public static long emptyValue() {
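A quick round trip through the slimmed-down codec, mirroring what codecTest verifies below; the values are borrowed from the tests, and the snippet assumes WordMetadata and EdgePageWordFlags are importable as in this repository:

import java.util.EnumSet;

class RoundTripDemo {
    public static void main(String[] args) {
        var original = new WordMetadata(32, 0x7f0f0000,
                EnumSet.allOf(EdgePageWordFlags.class));

        // encode() packs tfIdf, positions and flags into a single long;
        // the long-taking constructor unpacks it again. With count gone,
        // nothing in the record is lost in the round trip.
        var decoded = new WordMetadata(original.encode());

        System.out.println(original.equals(decoded)); // expected: true
    }
}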
@@ -12,16 +12,16 @@ class WordMetadataTest {
 
     @Test
     public void codecTest() {
-        verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
-        System.out.println(new WordMetadata(32, 0x7f0f0005, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        System.out.println(new WordMetadata(32, 0xff0f0013, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
+        verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, EnumSet.allOf(EdgePageWordFlags.class)));
+        verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, EnumSet.allOf(EdgePageWordFlags.class)));
+        verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, EnumSet.noneOf(EdgePageWordFlags.class)));
+        System.out.println(new WordMetadata(32, 0x7f0f0005, EnumSet.allOf(EdgePageWordFlags.class)));
+        System.out.println(new WordMetadata(32, 0xff0f0013, EnumSet.noneOf(EdgePageWordFlags.class)));
     }
 
     @Test
     public void testClampTfIdfLow() {
-        var original = new WordMetadata(0x8000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new WordMetadata(0x8000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class));
         var encoded = new WordMetadata(original.encode());
 
         assertEquals(original.positions(), encoded.positions());
@@ -30,32 +30,13 @@ class WordMetadataTest {
 
     @Test
     public void testClampTfIdfHigh() {
-        var original = new WordMetadata(0x7000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new WordMetadata(0x7000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class));
         var encoded = new WordMetadata(original.encode());
 
         assertEquals(original.positions(), encoded.positions());
         assertEquals(65535, encoded.tfIdf());
     }
 
-    @Test
-    public void testClampCountLow() {
-        var original = new WordMetadata(40, 0, -1, EnumSet.noneOf(EdgePageWordFlags.class));
-        var encoded = new WordMetadata(original.encode());
-
-        assertEquals(original.positions(), encoded.positions());
-        assertEquals(0, encoded.count());
-    }
-
-    @Test
-    public void testClampCountHigh() {
-        var original = new WordMetadata(40, 0, 17, EnumSet.noneOf(EdgePageWordFlags.class));
-        var encoded = new WordMetadata(original.encode());
-
-        assertEquals(original.positions(), encoded.positions());
-        assertEquals(15, encoded.count());
-    }
-
-
     public void verifyCodec(String message, WordMetadata data) {
         assertEquals(data, new WordMetadata(data.encode()), message);
     }
@@ -52,7 +52,7 @@ public class KeywordCounter {
             }
         }
 
-        HashMap<String, WordFrequencyData> tfIdf = keywordMetadata.wordsTfIdf();
+        Object2IntOpenHashMap<String> tfIdf = keywordMetadata.wordsTfIdf();
         List<WordRep> tfIdfHigh = new ArrayList<>();
 
         int maxVal = maxValue(counts);
@@ -61,7 +61,7 @@ public class KeywordCounter {
         counts.forEach((key, cnt) -> {
             int value = getTermValue(key, cnt, maxVal);
 
-            tfIdf.put(key, new WordFrequencyData(cnt, value));
+            tfIdf.put(key, value);
 
             if (cnt > 1 && value > 100) {
                 tfIdfHigh.addAll(instances.get(key));
@@ -81,12 +81,7 @@ public class SubjectCounter {
             return sum / parts.length;
         }
 
-        var meta = keywordMetadata.wordsTfIdf().get(stemmed);
-        if (meta != null) {
-            return meta.tfIdfNormalized();
-        }
-
-        return 0;
+        return keywordMetadata.wordsTfIdf().getOrDefault(stemmed, 0);
     }
 
     private boolean isDetOrAdverbOrVerb(String posTag) {
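The two keyword classes above now read tf-idf values out of fastutil's Object2IntOpenHashMap rather than a HashMap of WordFrequencyData records, saving one wrapper object per keyword now that only the tf-idf integer is stored. A small sketch of the primitive-map calls the diff relies on (the map contents here are made up for illustration):

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;

class TfIdfMapDemo {
    public static void main(String[] args) {
        // Same construction as in KeywordMetadata below: expected size, load factor.
        var tfIdf = new Object2IntOpenHashMap<String>(10_000, 0.7f);

        tfIdf.put("exampl", 120); // stores the int without boxing

        // Missing keys fall back to the supplied default, which is what lets
        // SubjectCounter replace its null check with a single getOrDefault.
        System.out.println(tfIdf.getOrDefault("exampl", 0)); // 120
        System.out.println(tfIdf.getOrDefault("absent", 0)); // 0
    }
}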
@@ -11,17 +11,17 @@ import java.util.Objects;
 
 public final class KeywordMetadata {
 
-    private static final WordFrequencyData empty = new WordFrequencyData(0, 0);
+    private static final WordFrequencyData empty = new WordFrequencyData(0);
     private final HashSet<String> titleKeywords = new HashSet<>(50);
     private final HashSet<String> subjectKeywords = new HashSet<>(10);
     private final HashSet<String> namesKeywords = new HashSet<>(50);
-    private final HashMap<String, WordFrequencyData> wordsTfIdf;
+    private final Object2IntOpenHashMap<String> wordsTfIdf;
     private final Object2IntOpenHashMap<String> positionMask;
     private final EnumSet<EdgePageWordFlags> wordFlagsTemplate;
 
     public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
         this.positionMask = new Object2IntOpenHashMap<>(10_000, 0.7f);
-        this.wordsTfIdf = new HashMap<>(10_000);
+        this.wordsTfIdf = new Object2IntOpenHashMap<>(10_000, 0.7f);
         this.wordFlagsTemplate = flags;
     }
 
@@ -31,7 +31,7 @@ public final class KeywordMetadata {
 
     public long getMetadataForWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
 
-        WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
+        int tfidf = wordsTfIdf.getOrDefault(stemmed, 0);
         EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();
 
         if (subjectKeywords.contains(stemmed))
@@ -44,9 +44,8 @@ public final class KeywordMetadata {
             flags.add(EdgePageWordFlags.Title);
 
         int positions = positionMask.getOrDefault(stemmed, 0);
-        int count = Math.max(Integer.bitCount(positions), tfidf.count());
 
-        return new WordMetadata(tfidf.tfIdfNormalized(), positions, count, flags).encode();
+        return new WordMetadata(tfidf, positions, flags).encode();
     }
 
     public HashSet<String> titleKeywords() {
@@ -61,7 +60,7 @@ public final class KeywordMetadata {
         return namesKeywords;
     }
 
-    public HashMap<String, WordFrequencyData> wordsTfIdf() {
+    public Object2IntOpenHashMap<String> wordsTfIdf() {
         return wordsTfIdf;
     }
 
@@ -1,4 +1,4 @@
 package nu.marginalia.language.model;
 
 
-public record WordFrequencyData(int count, int tfIdfNormalized) { }
+public record WordFrequencyData(int tfIdfNormalized) { }
@@ -169,7 +169,7 @@ public class IndexQueryServiceIntegrationTest {
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
             data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
-            data[2*i + 1] = new WordMetadata(i, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
+            data[2*i + 1] = new WordMetadata(i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
         }
 
         indexJournalWriter.put(header, new IndexJournalEntryData(data));
@@ -182,7 +182,7 @@ public class IndexQueryServiceIntegrationTest {
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
             data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
-            data[2*i + 1] = new WordMetadata(i % 20, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
+            data[2*i + 1] = new WordMetadata(i % 20, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
         }
 
         indexJournalWriter.put(header, new IndexJournalEntryData(data));
@@ -73,7 +73,7 @@ public class SearchApiQueryService {
                 continue outer;
 
             Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
-            lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), metadata.count(), flags));
+            lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), Integer.bitCount(metadata.positions()), flags));
         }
         details.add(lst);
     }
@@ -104,7 +104,7 @@ public class SearchResultValuator {
         double sum = 0.;
 
         for (var keyword : keywordSet) {
-            double count = Math.min(255, keyword.count());
+            double count = Math.min(255, Integer.bitCount(keyword.wordMetadata().positions()));
             double wt = keyword.weight() * keyword.weight() / keywordSet.length();
 
             final double invFreq = Math.log(1.0 + (docCount - wt + 0.5)/(wt + 0.5));
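An aside on the surrounding code: the invFreq expression has the shape of the BM25 inverse document frequency, ln(1 + (N - n + 0.5) / (n + 0.5)), with N = docCount and the squared, length-normalized keyword weight wt standing in for the document frequency n. That reading is an interpretation, not something the diff states; the only change in this hunk is that the per-keyword count feeding the rest of the sum is now derived from the positions mask.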
@@ -313,9 +313,7 @@ public class SearchResultValuator {
         public int tfIdf() {
             return wordMetadata.tfIdf();
         }
-        public int count() {
-            return wordMetadata.count();
-        }
 
         public EnumSet<EdgePageWordFlags> flags() {
             return wordMetadata.flagSet();
         }
@@ -85,7 +85,7 @@ class SearchResultValuatorTest {
                 .reduce((a,b) -> a|b)
                 .orElse(0);
 
-        return new WordMetadata(tfIdf, posBits, positions.size(), wordFlags).encode();
+        return new WordMetadata(tfIdf, posBits, wordFlags).encode();
     }
 
 }