Remove count from WordMetadata entirely.

Viktor Lofgren 2023-03-09 18:14:14 +01:00
parent 8fb531c614
commit efb46cc703
10 changed files with 26 additions and 66 deletions
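
In short: the 4-bit count field stored at shift 8 is dropped from the packed 64-bit word metadata, and everything that needs an occurrence count now derives one from the positions bitmask via Integer.bitCount. For orientation, here is a minimal standalone sketch of the resulting layout. The tf-idf mask and shift match the diff below; FLAGS_MASK, POSITIONS_MASK and POSITIONS_SHIFT are assumed values that do not appear in these hunks.

    // Illustration only, not the project's WordMetadata. TF_IDF_* match the
    // diff; FLAGS_MASK, POSITIONS_MASK and POSITIONS_SHIFT are assumptions.
    class WordMetadataLayoutSketch {
        static final long FLAGS_MASK = 0xFFL;           // assumed: low byte
        static final long TF_IDF_MASK = 0xFFFFL;        // from the diff
        static final int  TF_IDF_SHIFT = 16;            // from the diff
        static final long POSITIONS_MASK = 0xFFFFFFFFL; // assumed: high 32 bits
        static final int  POSITIONS_SHIFT = 32;         // assumed

        static long encode(int tfIdf, int positions, byte flags) {
            long ret = Byte.toUnsignedLong(flags);
            ret |= Math.min(TF_IDF_MASK, Math.max(0, tfIdf)) << TF_IDF_SHIFT;
            ret |= ((long) positions) << POSITIONS_SHIFT;
            return ret;
        }

        // The count field is gone; a per-document occurrence count is
        // approximated by counting set bits in the positions mask.
        static int derivedCount(long encoded) {
            return Integer.bitCount((int) ((encoded >>> POSITIONS_SHIFT) & POSITIONS_MASK));
        }

        public static void main(String[] args) {
            long v = encode(32, 0x7f0f0000, (byte) 1);
            System.out.println(derivedCount(v)); // 11 = bitCount(0x7f0f0000)
        }
    }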

View File

@@ -11,18 +11,9 @@ import static java.lang.Math.min;
 public record WordMetadata(int tfIdf,
                            int positions,
-                           int count,
                            byte flags) {
 
-    public WordMetadata {
-        if (WordMetadata.class.desiredAssertionStatus()) {
-            if (Integer.bitCount(positions) > count) {
-                System.err.println(Integer.bitCount(positions) + ">" + count);
-            }
-        }
-    }
-
-    public static final long COUNT_MASK = 0xFL; // 8 unsused bits at the beginning
-    public static final int COUNT_SHIFT = 8;
-
     public static final long TF_IDF_MASK = 0xFFFFL;
     public static final int TF_IDF_SHIFT = 16;
@@ -41,17 +32,15 @@ public record WordMetadata(int tfIdf,
         this(
                 (int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK),
                 (int)((value >>> POSITIONS_SHIFT) & POSITIONS_MASK),
-                Integer.bitCount((int) ((value >>> POSITIONS_SHIFT) & POSITIONS_MASK)),
                 (byte) (value & FLAGS_MASK)
         );
     }
 
     public WordMetadata(int tfIdf,
                         int positions,
-                        int count,
                         Set<EdgePageWordFlags> flags)
     {
-        this(tfIdf, positions, count, encodeFlags(flags));
+        this(tfIdf, positions, encodeFlags(flags));
     }
 
     private static byte encodeFlags(Set<EdgePageWordFlags> flags) {
@@ -82,7 +71,6 @@ public record WordMetadata(int tfIdf,
         StringBuilder sb = new StringBuilder(getClass().getSimpleName());
         sb.append('[')
                 .append("tfidf=").append(tfIdf).append(", ")
-                .append("count=").append(count).append(", ")
                 .append("positions=[").append(BrailleBlockPunchCards.printBits(positions, 32)).append(']');
         sb.append(", flags=").append(flags).append(']');
         return sb.toString();
@@ -95,14 +83,13 @@ public record WordMetadata(int tfIdf,
         ret |= Byte.toUnsignedLong(flags);
         ret |= min(TF_IDF_MASK, max(0, tfIdf)) << TF_IDF_SHIFT;
-        ret |= min(COUNT_MASK, max(0, count)) << COUNT_SHIFT;
         ret |= ((long)(positions)) << POSITIONS_SHIFT;
 
         return ret;
     }
 
     public boolean isEmpty() {
-        return count == 0 && positions == 0 && flags == 0 && tfIdf == 0;
+        return positions == 0 && flags == 0 && tfIdf == 0;
     }
 
     public static long emptyValue() {

View File

@@ -12,16 +12,16 @@ class WordMetadataTest {
     @Test
     public void codecTest() {
-        verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
-        System.out.println(new WordMetadata(32, 0x7f0f0005, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        System.out.println(new WordMetadata(32, 0xff0f0013, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
+        verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, EnumSet.allOf(EdgePageWordFlags.class)));
+        verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, EnumSet.allOf(EdgePageWordFlags.class)));
+        verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, EnumSet.noneOf(EdgePageWordFlags.class)));
+        System.out.println(new WordMetadata(32, 0x7f0f0005, EnumSet.allOf(EdgePageWordFlags.class)));
+        System.out.println(new WordMetadata(32, 0xff0f0013, EnumSet.noneOf(EdgePageWordFlags.class)));
     }
 
     @Test
     public void testClampTfIdfLow() {
-        var original = new WordMetadata(0x8000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new WordMetadata(0x8000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class));
         var encoded = new WordMetadata(original.encode());
 
         assertEquals(original.positions(), encoded.positions());
@@ -30,32 +30,13 @@ class WordMetadataTest {
     @Test
     public void testClampTfIdfHigh() {
-        var original = new WordMetadata(0x7000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new WordMetadata(0x7000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class));
         var encoded = new WordMetadata(original.encode());
 
         assertEquals(original.positions(), encoded.positions());
         assertEquals(65535, encoded.tfIdf());
     }
 
-    @Test
-    public void testClampCountLow() {
-        var original = new WordMetadata(40, 0, -1, EnumSet.noneOf(EdgePageWordFlags.class));
-        var encoded = new WordMetadata(original.encode());
-
-        assertEquals(original.positions(), encoded.positions());
-        assertEquals(0, encoded.count());
-    }
-
-    @Test
-    public void testClampCountHigh() {
-        var original = new WordMetadata(40, 0, 17, EnumSet.noneOf(EdgePageWordFlags.class));
-        var encoded = new WordMetadata(original.encode());
-
-        assertEquals(original.positions(), encoded.positions());
-        assertEquals(15, encoded.count());
-    }
-
     public void verifyCodec(String message, WordMetadata data) {
         assertEquals(data, new WordMetadata(data.encode()), message);
     }

View File

@@ -52,7 +52,7 @@ public class KeywordCounter {
             }
         }
 
-        HashMap<String, WordFrequencyData> tfIdf = keywordMetadata.wordsTfIdf();
+        Object2IntOpenHashMap<String> tfIdf = keywordMetadata.wordsTfIdf();
         List<WordRep> tfIdfHigh = new ArrayList<>();
 
         int maxVal = maxValue(counts);
@@ -61,7 +61,7 @@ public class KeywordCounter {
         counts.forEach((key, cnt) -> {
             int value = getTermValue(key, cnt, maxVal);
 
-            tfIdf.put(key, new WordFrequencyData(cnt, value));
+            tfIdf.put(key, value);
 
             if (cnt > 1 && value > 100) {
                 tfIdfHigh.addAll(instances.get(key));

View File

@@ -81,12 +81,7 @@ public class SubjectCounter {
             return sum / parts.length;
         }
 
-        var meta = keywordMetadata.wordsTfIdf().get(stemmed);
-        if (meta != null) {
-            return meta.tfIdfNormalized();
-        }
-        return 0;
+        return keywordMetadata.wordsTfIdf().getOrDefault(stemmed, 0);
     }
 
     private boolean isDetOrAdverbOrVerb(String posTag) {

View File

@@ -11,17 +11,17 @@ import java.util.Objects;
 
 public final class KeywordMetadata {
 
-    private static final WordFrequencyData empty = new WordFrequencyData(0, 0);
+    private static final WordFrequencyData empty = new WordFrequencyData(0);
 
     private final HashSet<String> titleKeywords = new HashSet<>(50);
     private final HashSet<String> subjectKeywords = new HashSet<>(10);
     private final HashSet<String> namesKeywords = new HashSet<>(50);
-    private final HashMap<String, WordFrequencyData> wordsTfIdf;
+    private final Object2IntOpenHashMap<String> wordsTfIdf;
     private final Object2IntOpenHashMap<String> positionMask;
     private final EnumSet<EdgePageWordFlags> wordFlagsTemplate;
 
     public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
         this.positionMask = new Object2IntOpenHashMap<>(10_000, 0.7f);
-        this.wordsTfIdf = new HashMap<>(10_000);
+        this.wordsTfIdf = new Object2IntOpenHashMap<>(10_000, 0.7f);
         this.wordFlagsTemplate = flags;
     }
@@ -31,7 +31,7 @@ public final class KeywordMetadata {
     public long getMetadataForWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
 
-        WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
+        int tfidf = wordsTfIdf.getOrDefault(stemmed, 0);
         EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();
 
         if (subjectKeywords.contains(stemmed))
@@ -44,9 +44,8 @@ public final class KeywordMetadata {
             flags.add(EdgePageWordFlags.Title);
 
         int positions = positionMask.getOrDefault(stemmed, 0);
-        int count = Math.max(Integer.bitCount(positions), tfidf.count());
 
-        return new WordMetadata(tfidf.tfIdfNormalized(), positions, count, flags).encode();
+        return new WordMetadata(tfidf, positions, flags).encode();
     }
 
     public HashSet<String> titleKeywords() {
@@ -61,7 +60,7 @@ public final class KeywordMetadata {
         return namesKeywords;
     }
 
-    public HashMap<String, WordFrequencyData> wordsTfIdf() {
+    public Object2IntOpenHashMap<String> wordsTfIdf() {
         return wordsTfIdf;
     }

View File

@@ -1,4 +1,4 @@
 package nu.marginalia.language.model;
 
-public record WordFrequencyData(int count, int tfIdfNormalized) { }
+public record WordFrequencyData(int tfIdfNormalized) { }

View File

@@ -169,7 +169,7 @@ public class IndexQueryServiceIntegrationTest {
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
             data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
-            data[2*i + 1] = new WordMetadata(i, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
+            data[2*i + 1] = new WordMetadata(i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
         }
 
         indexJournalWriter.put(header, new IndexJournalEntryData(data));
@@ -182,7 +182,7 @@ public class IndexQueryServiceIntegrationTest {
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
             data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
-            data[2*i + 1] = new WordMetadata(i % 20, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
+            data[2*i + 1] = new WordMetadata(i % 20, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
         }
 
         indexJournalWriter.put(header, new IndexJournalEntryData(data));

View File

@@ -73,7 +73,7 @@ public class SearchApiQueryService {
                         continue outer;
 
                 Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
-                lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), metadata.count(), flags));
+                lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), Integer.bitCount(metadata.positions()), flags));
             }
             details.add(lst);
         }
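
Since there is no stored count to report anymore, the API hands back the number of set position bits instead. Note that the two quantities saturate differently: the old packed count capped at 15 (COUNT_MASK = 0xF), while the bit count of a 32-bit mask caps at 32. A tiny illustration:

    class PositionCountSketch {
        public static void main(String[] args) {
            int positions = 0xff0f0013;   // a mask used in the tests above

            // Derived count = number of set position bits; never exceeds 32.
            System.out.println(Integer.bitCount(positions));   // 15
            System.out.println(Integer.bitCount(0xffffffff));  // 32
        }
    }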

View File

@@ -104,7 +104,7 @@ public class SearchResultValuator {
         double sum = 0.;
 
         for (var keyword : keywordSet) {
-            double count = Math.min(255, keyword.count());
+            double count = Math.min(255, Integer.bitCount(keyword.wordMetadata().positions()));
             double wt = keyword.weight() * keyword.weight() / keywordSet.length();
 
             final double invFreq = Math.log(1.0 + (docCount - wt + 0.5)/(wt + 0.5));
@@ -313,9 +313,7 @@ public class SearchResultValuator {
         public int tfIdf() {
             return wordMetadata.tfIdf();
         }
-        public int count() {
-            return wordMetadata.count();
-        }
 
         public EnumSet<EdgePageWordFlags> flags() {
             return wordMetadata.flagSet();
         }

View File

@@ -85,7 +85,7 @@ class SearchResultValuatorTest {
                 .reduce((a,b) -> a|b)
                 .orElse(0);
 
-        return new WordMetadata(tfIdf, posBits, positions.size(), wordFlags).encode();
+        return new WordMetadata(tfIdf, posBits, wordFlags).encode();
    }
 }
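
One nuance this last test change highlights: positions.size() counted every occurrence, while Integer.bitCount(posBits) counts distinct position buckets, so occurrences that OR into the same bit are now collapsed. A standalone illustration (not project code):

    import java.util.List;

    class PosBitsSketch {
        public static void main(String[] args) {
            // Two of the three occurrences land in the same position bucket.
            List<Integer> positions = List.of(1, 5, 5);

            int posBits = positions.stream()
                    .mapToInt(p -> 1 << p)
                    .reduce(0, (a, b) -> a | b);

            // The old count would have been positions.size() == 3;
            // the derived count is the number of distinct buckets.
            System.out.println(Integer.bitCount(posBits)); // 2
        }
    }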