Remove count from WordMetadata entirely.

Viktor Lofgren 2023-03-09 18:14:14 +01:00
parent 8fb531c614
commit efb46cc703
10 changed files with 26 additions and 66 deletions
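
Why: the stored 4-bit count was redundant, since an occurrence count can be
derived from the positions bitmask wherever it is needed. A sketch of the
packed 64-bit layout implied by the masks and shifts in this diff (the
POSITIONS_SHIFT value of 32 is an assumption; it does not appear in the
hunks below):

    // before: [ positions:32 | tfIdf:16 | unused:4 | count:4 | flags:8 ]
    // after:  [ positions:32 | tfIdf:16 |      unused:8      | flags:8 ]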

WordMetadata.java

@@ -11,18 +11,9 @@ import static java.lang.Math.min;
 public record WordMetadata(int tfIdf,
                            int positions,
-                           int count,
                            byte flags) {

-    public WordMetadata {
-        if (WordMetadata.class.desiredAssertionStatus()) {
-            if (Integer.bitCount(positions) > count) {
-                System.err.println(Integer.bitCount(positions) + ">" + count);
-            }
-        }
-    }
-
-    public static final long COUNT_MASK = 0xFL;
-    public static final int COUNT_SHIFT = 8;
+    // 8 unused bits at the beginning

     public static final long TF_IDF_MASK = 0xFFFFL;
     public static final int TF_IDF_SHIFT = 16;
@@ -41,17 +32,15 @@ public record WordMetadata(int tfIdf,
     this(
             (int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK),
             (int)((value >>> POSITIONS_SHIFT) & POSITIONS_MASK),
-            Integer.bitCount((int) ((value >>> POSITIONS_SHIFT) & POSITIONS_MASK)),
             (byte) (value & FLAGS_MASK)
     );
 }

 public WordMetadata(int tfIdf,
                     int positions,
-                    int count,
                     Set<EdgePageWordFlags> flags)
 {
-    this(tfIdf, positions, count, encodeFlags(flags));
+    this(tfIdf, positions, encodeFlags(flags));
 }

 private static byte encodeFlags(Set<EdgePageWordFlags> flags) {
@@ -82,7 +71,6 @@ public record WordMetadata(int tfIdf,
     StringBuilder sb = new StringBuilder(getClass().getSimpleName());
     sb.append('[')
             .append("tfidf=").append(tfIdf).append(", ")
-            .append("count=").append(count).append(", ")
             .append("positions=[").append(BrailleBlockPunchCards.printBits(positions, 32)).append(']');
     sb.append(", flags=").append(flags).append(']');
     return sb.toString();
@@ -95,14 +83,13 @@ public record WordMetadata(int tfIdf,
     ret |= Byte.toUnsignedLong(flags);
     ret |= min(TF_IDF_MASK, max(0, tfIdf)) << TF_IDF_SHIFT;
-    ret |= min(COUNT_MASK, max(0, count)) << COUNT_SHIFT;
     ret |= ((long)(positions)) << POSITIONS_SHIFT;

     return ret;
 }

 public boolean isEmpty() {
-    return count == 0 && positions == 0 && flags == 0 && tfIdf == 0;
+    return positions == 0 && flags == 0 && tfIdf == 0;
 }

 public static long emptyValue() {
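
A minimal usage sketch of the slimmed-down record (the values are made up;
the three-argument constructor, encode(), and the decoding constructor are
as shown in the hunks above):

    var meta = new WordMetadata(32, 0x7f0f0000, EnumSet.of(EdgePageWordFlags.Title));
    long packed = meta.encode();
    var decoded = new WordMetadata(packed);
    // count is no longer stored; callers derive it from the positions mask
    int count = Integer.bitCount(decoded.positions());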

WordMetadataTest.java

@@ -12,16 +12,16 @@ class WordMetadataTest {
     @Test
     public void codecTest() {
-        verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
-        System.out.println(new WordMetadata(32, 0x7f0f0005, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        System.out.println(new WordMetadata(32, 0xff0f0013, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
+        verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, EnumSet.allOf(EdgePageWordFlags.class)));
+        verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, EnumSet.allOf(EdgePageWordFlags.class)));
+        verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, EnumSet.noneOf(EdgePageWordFlags.class)));
+        System.out.println(new WordMetadata(32, 0x7f0f0005, EnumSet.allOf(EdgePageWordFlags.class)));
+        System.out.println(new WordMetadata(32, 0xff0f0013, EnumSet.noneOf(EdgePageWordFlags.class)));
     }

     @Test
     public void testClampTfIdfLow() {
-        var original = new WordMetadata(0x8000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new WordMetadata(0x8000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class));

         var encoded = new WordMetadata(original.encode());

         assertEquals(original.positions(), encoded.positions());
@@ -30,32 +30,13 @@ class WordMetadataTest {
     @Test
     public void testClampTfIdfHigh() {
-        var original = new WordMetadata(0x7000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new WordMetadata(0x7000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class));

         var encoded = new WordMetadata(original.encode());

         assertEquals(original.positions(), encoded.positions());
         assertEquals(65535, encoded.tfIdf());
     }

-    @Test
-    public void testClampCountLow() {
-        var original = new WordMetadata(40, 0, -1, EnumSet.noneOf(EdgePageWordFlags.class));
-
-        var encoded = new WordMetadata(original.encode());
-
-        assertEquals(original.positions(), encoded.positions());
-        assertEquals(0, encoded.count());
-    }
-
-    @Test
-    public void testClampCountHigh() {
-        var original = new WordMetadata(40, 0, 17, EnumSet.noneOf(EdgePageWordFlags.class));
-
-        var encoded = new WordMetadata(original.encode());
-
-        assertEquals(original.positions(), encoded.positions());
-        assertEquals(15, encoded.count());
-    }

     public void verifyCodec(String message, WordMetadata data) {
         assertEquals(data, new WordMetadata(data.encode()), message);
     }
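
The two deleted tests covered clamping of out-of-range count values. With
count derived instead of stored, Integer.bitCount(positions) is always in
the range 0..32, so there is nothing left to clamp. A hypothetical test
sketching the new invariant (not part of this commit):

    @Test
    public void derivedCountIsBounded() {
        var m = new WordMetadata(40, 0xff0f0013, EnumSet.noneOf(EdgePageWordFlags.class));
        int derived = Integer.bitCount(new WordMetadata(m.encode()).positions());
        assertTrue(derived >= 0 && derived <= 32);
    }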

KeywordCounter.java

@@ -52,7 +52,7 @@ public class KeywordCounter {
         }
     }

-    HashMap<String, WordFrequencyData> tfIdf = keywordMetadata.wordsTfIdf();
+    Object2IntOpenHashMap<String> tfIdf = keywordMetadata.wordsTfIdf();
     List<WordRep> tfIdfHigh = new ArrayList<>();

     int maxVal = maxValue(counts);

@@ -61,7 +61,7 @@ public class KeywordCounter {
     counts.forEach((key, cnt) -> {
         int value = getTermValue(key, cnt, maxVal);

-        tfIdf.put(key, new WordFrequencyData(cnt, value));
+        tfIdf.put(key, value);

         if (cnt > 1 && value > 100) {
             tfIdfHigh.addAll(instances.get(key));
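
The switch from HashMap<String, WordFrequencyData> to fastutil's
Object2IntOpenHashMap keeps the tf-idf values as unboxed ints. A minimal
sketch of the primitive-map calls used here (the key and value are made up):

    Object2IntOpenHashMap<String> tfIdf = new Object2IntOpenHashMap<>(10_000, 0.7f);
    tfIdf.put("marginalia", 250);                // put(K, int), no Integer boxing
    int value = tfIdf.getOrDefault("absent", 0); // primitive default, no null check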

SubjectCounter.java

@@ -81,12 +81,7 @@ public class SubjectCounter {
         return sum / parts.length;
     }

-    var meta = keywordMetadata.wordsTfIdf().get(stemmed);
-
-    if (meta != null) {
-        return meta.tfIdfNormalized();
-    }
-    return 0;
+    return keywordMetadata.wordsTfIdf().getOrDefault(stemmed, 0);
 }

 private boolean isDetOrAdverbOrVerb(String posTag) {

KeywordMetadata.java

@@ -11,17 +11,17 @@ import java.util.Objects;
 public final class KeywordMetadata {

-    private static final WordFrequencyData empty = new WordFrequencyData(0, 0);
+    private static final WordFrequencyData empty = new WordFrequencyData(0);

     private final HashSet<String> titleKeywords = new HashSet<>(50);
     private final HashSet<String> subjectKeywords = new HashSet<>(10);
     private final HashSet<String> namesKeywords = new HashSet<>(50);
-    private final HashMap<String, WordFrequencyData> wordsTfIdf;
+    private final Object2IntOpenHashMap<String> wordsTfIdf;
     private final Object2IntOpenHashMap<String> positionMask;
     private final EnumSet<EdgePageWordFlags> wordFlagsTemplate;

     public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
         this.positionMask = new Object2IntOpenHashMap<>(10_000, 0.7f);
-        this.wordsTfIdf = new HashMap<>(10_000);
+        this.wordsTfIdf = new Object2IntOpenHashMap<>(10_000, 0.7f);
         this.wordFlagsTemplate = flags;
     }

@@ -31,7 +31,7 @@ public final class KeywordMetadata {
     public long getMetadataForWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {

-        WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
+        int tfidf = wordsTfIdf.getOrDefault(stemmed, 0);
         EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();

         if (subjectKeywords.contains(stemmed))

@@ -44,9 +44,8 @@ public final class KeywordMetadata {
             flags.add(EdgePageWordFlags.Title);

         int positions = positionMask.getOrDefault(stemmed, 0);
-        int count = Math.max(Integer.bitCount(positions), tfidf.count());

-        return new WordMetadata(tfidf.tfIdfNormalized(), positions, count, flags).encode();
+        return new WordMetadata(tfidf, positions, flags).encode();
     }

     public HashSet<String> titleKeywords() {

@@ -61,7 +60,7 @@ public final class KeywordMetadata {
         return namesKeywords;
     }

-    public HashMap<String, WordFrequencyData> wordsTfIdf() {
+    public Object2IntOpenHashMap<String> wordsTfIdf() {
         return wordsTfIdf;
     }
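
A sketch of the resulting call path, using the constructor and accessors
shown above (the stemmed term and tf-idf value are made up):

    KeywordMetadata km = new KeywordMetadata(EnumSet.noneOf(EdgePageWordFlags.class));
    km.wordsTfIdf().put("exampl", 120); // stemmed term -> tf-idf as a plain int
    long packed = km.getMetadataForWord(EnumSet.noneOf(EdgePageWordFlags.class), "exampl");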

WordFrequencyData.java

@@ -1,4 +1,4 @@
 package nu.marginalia.language.model;

-public record WordFrequencyData(int count, int tfIdfNormalized) { }
+public record WordFrequencyData(int tfIdfNormalized) { }

IndexQueryServiceIntegrationTest.java

@@ -169,7 +169,7 @@ public class IndexQueryServiceIntegrationTest {
     long[] data = new long[factors.length*2];
     for (int i = 0; i < factors.length; i++) {
         data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
-        data[2*i + 1] = new WordMetadata(i, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
+        data[2*i + 1] = new WordMetadata(i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
     }

     indexJournalWriter.put(header, new IndexJournalEntryData(data));

@@ -182,7 +182,7 @@ public class IndexQueryServiceIntegrationTest {
     long[] data = new long[factors.length*2];
     for (int i = 0; i < factors.length; i++) {
         data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
-        data[2*i + 1] = new WordMetadata(i % 20, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
+        data[2*i + 1] = new WordMetadata(i % 20, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
     }

     indexJournalWriter.put(header, new IndexJournalEntryData(data));

SearchApiQueryService.java

@@ -73,7 +73,7 @@ public class SearchApiQueryService {
             continue outer;

         Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
-        lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), metadata.count(), flags));
+        lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), Integer.bitCount(metadata.positions()), flags));
     }
     details.add(lst);
 }
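
One small semantic shift for API consumers: the old stored count saturated
at 15 (COUNT_MASK was 0xF), while the derived value ranges up to 32:

    int apiCount = Integer.bitCount(metadata.positions()); // now 0..32, was clamped to 0..15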

SearchResultValuator.java

@@ -104,7 +104,7 @@ public class SearchResultValuator {
     double sum = 0.;

     for (var keyword : keywordSet) {
-        double count = Math.min(255, keyword.count());
+        double count = Math.min(255, Integer.bitCount(keyword.wordMetadata().positions()));
         double wt = keyword.weight() * keyword.weight() / keywordSet.length();

         final double invFreq = Math.log(1.0 + (docCount - wt + 0.5)/(wt + 0.5));

@@ -313,9 +313,7 @@ public class SearchResultValuator {
     public int tfIdf() {
         return wordMetadata.tfIdf();
     }

-    public int count() {
-        return wordMetadata.count();
-    }
-
     public EnumSet<EdgePageWordFlags> flags() {
         return wordMetadata.flagSet();
     }
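
For context, the invFreq expression in the first hunk reads like the
standard BM25 IDF term, log(1 + (N - n + 0.5)/(n + 0.5)), with docCount as N
and wt standing in for the document frequency. Since the derived count is at
most 32 by construction, the Math.min(255, ...) guard above it is now a
conservative no-op.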

SearchResultValuatorTest.java

@@ -85,7 +85,7 @@ class SearchResultValuatorTest {
                 .reduce((a,b) -> a|b)
                 .orElse(0);

-        return new WordMetadata(tfIdf, posBits, positions.size(), wordFlags).encode();
+        return new WordMetadata(tfIdf, posBits, wordFlags).encode();
     }
 }