Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00

Commit efb46cc703 (parent 8fb531c614): Remove count from WordMetadata entirely.
@@ -11,18 +11,9 @@ import static java.lang.Math.min;
 public record WordMetadata(int tfIdf,
                            int positions,
-                           int count,
                            byte flags) {
-    public WordMetadata {
-        if (WordMetadata.class.desiredAssertionStatus()) {
-            if (Integer.bitCount(positions) > count) {
-                System.err.println(Integer.bitCount(positions) + ">" + count);
-            }
-        }
-    }
 
-    public static final long COUNT_MASK = 0xFL;
-    public static final int COUNT_SHIFT = 8;
+    // 8 unused bits at the beginning
 
     public static final long TF_IDF_MASK = 0xFFFFL;
     public static final int TF_IDF_SHIFT = 16;
@@ -41,17 +32,15 @@ public record WordMetadata(int tfIdf,
         this(
                 (int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK),
                 (int)((value >>> POSITIONS_SHIFT) & POSITIONS_MASK),
-                Integer.bitCount((int) ((value >>> POSITIONS_SHIFT) & POSITIONS_MASK)),
                 (byte) (value & FLAGS_MASK)
         );
     }
 
     public WordMetadata(int tfIdf,
                         int positions,
-                        int count,
                         Set<EdgePageWordFlags> flags)
     {
-        this(tfIdf, positions, count, encodeFlags(flags));
+        this(tfIdf, positions, encodeFlags(flags));
     }
 
     private static byte encodeFlags(Set<EdgePageWordFlags> flags) {
@@ -82,7 +71,6 @@ public record WordMetadata(int tfIdf,
         StringBuilder sb = new StringBuilder(getClass().getSimpleName());
         sb.append('[')
                 .append("tfidf=").append(tfIdf).append(", ")
-                .append("count=").append(count).append(", ")
                 .append("positions=[").append(BrailleBlockPunchCards.printBits(positions, 32)).append(']');
         sb.append(", flags=").append(flags).append(']');
         return sb.toString();
@@ -95,14 +83,13 @@ public record WordMetadata(int tfIdf,
 
         ret |= Byte.toUnsignedLong(flags);
         ret |= min(TF_IDF_MASK, max(0, tfIdf)) << TF_IDF_SHIFT;
-        ret |= min(COUNT_MASK, max(0, count)) << COUNT_SHIFT;
         ret |= ((long)(positions)) << POSITIONS_SHIFT;
 
         return ret;
     }
 
     public boolean isEmpty() {
-        return count == 0 && positions == 0 && flags == 0 && tfIdf == 0;
+        return positions == 0 && flags == 0 && tfIdf == 0;
     }
 
     public static long emptyValue() {
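With count removed, the packed 64-bit layout keeps flags in the low byte, leaves bits 8-15 unused, stores tf-idf in bits 16-31, and fills bits 32-63 with the 32-bit positions mask. Below is a minimal standalone sketch of that packing; FLAGS_MASK, POSITIONS_MASK and POSITIONS_SHIFT are assumed values inferred from the layout, not shown in the hunks above. It also demonstrates why count could be dropped: it stays recoverable as a popcount of the positions mask.

    public class WordMetadataSketch {
        static final long FLAGS_MASK = 0xFFL;            // assumed, not in the diff
        static final long TF_IDF_MASK = 0xFFFFL;
        static final int TF_IDF_SHIFT = 16;
        static final long POSITIONS_MASK = 0xFFFFFFFFL;  // assumed, not in the diff
        static final int POSITIONS_SHIFT = 32;           // assumed, not in the diff

        static long encode(int tfIdf, int positions, byte flags) {
            long ret = 0;
            ret |= Byte.toUnsignedLong(flags);                                 // bits 0-7
            ret |= Math.min(TF_IDF_MASK, Math.max(0, tfIdf)) << TF_IDF_SHIFT;  // bits 16-31, clamped
            ret |= ((long) positions) << POSITIONS_SHIFT;                      // bits 32-63
            return ret;
        }

        public static void main(String[] args) {
            long packed = encode(32, 0x7f0f0000, (byte) 3);
            int tfIdf = (int) ((packed >>> TF_IDF_SHIFT) & TF_IDF_MASK);
            int positions = (int) ((packed >>> POSITIONS_SHIFT) & POSITIONS_MASK);
            byte flags = (byte) (packed & FLAGS_MASK);
            int count = Integer.bitCount(positions); // the dropped field, derived on demand
            System.out.printf("tfIdf=%d positions=%08x flags=%d count=%d%n",
                              tfIdf, positions, flags, count);
        }
    }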
@@ -12,16 +12,16 @@ class WordMetadataTest {
 
     @Test
     public void codecTest() {
-        verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
-        System.out.println(new WordMetadata(32, 0x7f0f0005, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        System.out.println(new WordMetadata(32, 0xff0f0013, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
+        verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, EnumSet.allOf(EdgePageWordFlags.class)));
+        verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, EnumSet.allOf(EdgePageWordFlags.class)));
+        verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, EnumSet.noneOf(EdgePageWordFlags.class)));
+        System.out.println(new WordMetadata(32, 0x7f0f0005, EnumSet.allOf(EdgePageWordFlags.class)));
+        System.out.println(new WordMetadata(32, 0xff0f0013, EnumSet.noneOf(EdgePageWordFlags.class)));
     }
 
     @Test
     public void testClampTfIdfLow() {
-        var original = new WordMetadata(0x8000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new WordMetadata(0x8000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class));
         var encoded = new WordMetadata(original.encode());
 
         assertEquals(original.positions(), encoded.positions());
@@ -30,32 +30,13 @@ class WordMetadataTest {
 
     @Test
     public void testClampTfIdfHigh() {
-        var original = new WordMetadata(0x7000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new WordMetadata(0x7000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class));
         var encoded = new WordMetadata(original.encode());
 
         assertEquals(original.positions(), encoded.positions());
         assertEquals(65535, encoded.tfIdf());
     }
 
-    @Test
-    public void testClampCountLow() {
-        var original = new WordMetadata(40, 0, -1, EnumSet.noneOf(EdgePageWordFlags.class));
-        var encoded = new WordMetadata(original.encode());
-
-        assertEquals(original.positions(), encoded.positions());
-        assertEquals(0, encoded.count());
-    }
-
-    @Test
-    public void testClampCountHigh() {
-        var original = new WordMetadata(40, 0, 17, EnumSet.noneOf(EdgePageWordFlags.class));
-        var encoded = new WordMetadata(original.encode());
-
-        assertEquals(original.positions(), encoded.positions());
-        assertEquals(15, encoded.count());
-    }
-
-
     public void verifyCodec(String message, WordMetadata data) {
         assertEquals(data, new WordMetadata(data.encode()), message);
     }
@@ -52,7 +52,7 @@ public class KeywordCounter {
             }
         }
 
-        HashMap<String, WordFrequencyData> tfIdf = keywordMetadata.wordsTfIdf();
+        Object2IntOpenHashMap<String> tfIdf = keywordMetadata.wordsTfIdf();
         List<WordRep> tfIdfHigh = new ArrayList<>();
 
         int maxVal = maxValue(counts);
@@ -61,7 +61,7 @@ public class KeywordCounter {
         counts.forEach((key, cnt) -> {
             int value = getTermValue(key, cnt, maxVal);
 
-            tfIdf.put(key, new WordFrequencyData(cnt, value));
+            tfIdf.put(key, value);
 
             if (cnt > 1 && value > 100) {
                 tfIdfHigh.addAll(instances.get(key));
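The map swap above replaces boxed WordFrequencyData values with fastutil's primitive-int hash map, so each term's tf-idf is stored as an unboxed int. A small usage sketch of the pattern, assuming fastutil is on the classpath (the keys and values here are made up):

    import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;

    public class TfIdfMapSketch {
        public static void main(String[] args) {
            // Same sizing as the diff: expected capacity 10_000, load factor 0.7.
            Object2IntOpenHashMap<String> tfIdf = new Object2IntOpenHashMap<>(10_000, 0.7f);

            tfIdf.put("marginalia", 250); // stores a primitive int, no boxing

            // Misses fall back to a primitive default instead of null, which is
            // what lets SubjectCounter collapse to a single getOrDefault call.
            int hit = tfIdf.getOrDefault("marginalia", 0);
            int miss = tfIdf.getOrDefault("absent", 0);
            System.out.println(hit + " " + miss);
        }
    }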
@@ -81,12 +81,7 @@ public class SubjectCounter {
         return sum / parts.length;
     }
 
-        var meta = keywordMetadata.wordsTfIdf().get(stemmed);
-
-        if (meta != null) {
-            return meta.tfIdfNormalized();
-        }
-
-        return 0;
+        return keywordMetadata.wordsTfIdf().getOrDefault(stemmed, 0);
     }
 
     private boolean isDetOrAdverbOrVerb(String posTag) {
@@ -11,17 +11,17 @@ import java.util.Objects;
 
 public final class KeywordMetadata {
 
-    private static final WordFrequencyData empty = new WordFrequencyData(0, 0);
+    private static final WordFrequencyData empty = new WordFrequencyData(0);
     private final HashSet<String> titleKeywords = new HashSet<>(50);
     private final HashSet<String> subjectKeywords = new HashSet<>(10);
     private final HashSet<String> namesKeywords = new HashSet<>(50);
-    private final HashMap<String, WordFrequencyData> wordsTfIdf;
+    private final Object2IntOpenHashMap<String> wordsTfIdf;
    private final Object2IntOpenHashMap<String> positionMask;
     private final EnumSet<EdgePageWordFlags> wordFlagsTemplate;
 
     public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
         this.positionMask = new Object2IntOpenHashMap<>(10_000, 0.7f);
-        this.wordsTfIdf = new HashMap<>(10_000);
+        this.wordsTfIdf = new Object2IntOpenHashMap<>(10_000, 0.7f);
         this.wordFlagsTemplate = flags;
     }
 
@@ -31,7 +31,7 @@ public final class KeywordMetadata {
 
     public long getMetadataForWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
 
-        WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
+        int tfidf = wordsTfIdf.getOrDefault(stemmed, 0);
         EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();
 
         if (subjectKeywords.contains(stemmed))
@@ -44,9 +44,8 @@ public final class KeywordMetadata {
             flags.add(EdgePageWordFlags.Title);
 
         int positions = positionMask.getOrDefault(stemmed, 0);
-        int count = Math.max(Integer.bitCount(positions), tfidf.count());
 
-        return new WordMetadata(tfidf.tfIdfNormalized(), positions, count, flags).encode();
+        return new WordMetadata(tfidf, positions, flags).encode();
     }
 
     public HashSet<String> titleKeywords() {
@@ -61,7 +60,7 @@ public final class KeywordMetadata {
         return namesKeywords;
     }
 
-    public HashMap<String, WordFrequencyData> wordsTfIdf() {
+    public Object2IntOpenHashMap<String> wordsTfIdf() {
         return wordsTfIdf;
     }
 
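After these changes, getMetadataForWord reads both per-word signals out of primitive maps and packs them straight into the metadata long. A rough standalone approximation of the new flow, with a stand-in enum for EdgePageWordFlags and a toy flag encoding in place of the real encodeFlags:

    import java.util.EnumSet;
    import java.util.HashSet;
    import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;

    public class MetadataFlowSketch {
        enum Flag { Title, Subjects, NamesWords } // stand-in for EdgePageWordFlags

        final Object2IntOpenHashMap<String> wordsTfIdf = new Object2IntOpenHashMap<>(10_000, 0.7f);
        final Object2IntOpenHashMap<String> positionMask = new Object2IntOpenHashMap<>(10_000, 0.7f);
        final HashSet<String> subjectKeywords = new HashSet<>(10);

        // Mirrors the shape of getMetadataForWord after this commit: no count
        // is computed or stored; only tf-idf, positions and flags are packed.
        long metadataForWord(EnumSet<Flag> flagsTemplate, String stemmed) {
            int tfidf = wordsTfIdf.getOrDefault(stemmed, 0);
            EnumSet<Flag> flags = flagsTemplate.clone();

            if (subjectKeywords.contains(stemmed))
                flags.add(Flag.Subjects);

            int positions = positionMask.getOrDefault(stemmed, 0);

            long flagBits = flags.contains(Flag.Title) ? 1L : 0L; // toy flag encoding
            return flagBits
                 | (Math.min(0xFFFFL, Math.max(0, tfidf)) << 16)
                 | (((long) positions) << 32);
        }
    }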
@@ -1,4 +1,4 @@
 package nu.marginalia.language.model;
 
 
-public record WordFrequencyData(int count, int tfIdfNormalized) { }
+public record WordFrequencyData(int tfIdfNormalized) { }
@@ -169,7 +169,7 @@ public class IndexQueryServiceIntegrationTest {
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
             data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
-            data[2*i + 1] = new WordMetadata(i, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
+            data[2*i + 1] = new WordMetadata(i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
         }
 
         indexJournalWriter.put(header, new IndexJournalEntryData(data));
@@ -182,7 +182,7 @@ public class IndexQueryServiceIntegrationTest {
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
             data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
-            data[2*i + 1] = new WordMetadata(i % 20, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
+            data[2*i + 1] = new WordMetadata(i % 20, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
         }
 
         indexJournalWriter.put(header, new IndexJournalEntryData(data));
@@ -73,7 +73,7 @@ public class SearchApiQueryService {
                 continue outer;
 
             Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
-            lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), metadata.count(), flags));
+            lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), Integer.bitCount(metadata.positions()), flags));
         }
         details.add(lst);
     }
@@ -104,7 +104,7 @@ public class SearchResultValuator {
         double sum = 0.;
 
         for (var keyword : keywordSet) {
-            double count = Math.min(255, keyword.count());
+            double count = Math.min(255, Integer.bitCount(keyword.wordMetadata().positions()));
             double wt = keyword.weight() * keyword.weight() / keywordSet.length();
 
             final double invFreq = Math.log(1.0 + (docCount - wt + 0.5)/(wt + 0.5));
@@ -313,9 +313,7 @@ public class SearchResultValuator {
         public int tfIdf() {
             return wordMetadata.tfIdf();
         }
-        public int count() {
-            return wordMetadata.count();
-        }
         public EnumSet<EdgePageWordFlags> flags() {
             return wordMetadata.flagSet();
         }
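One consequence of the valuator change above: Integer.bitCount of a 32-bit mask never exceeds 32, so the pre-existing min(255, ...) clamp can no longer bind. A small rendition of the per-keyword arithmetic, with made-up inputs for docCount and the keyword weight:

    public class KeywordWeightSketch {
        public static void main(String[] args) {
            double docCount = 1_000_000;  // hypothetical corpus document count
            int positions = 0x7f0f0000;   // hypothetical positions mask

            // count now derives from the positions mask; bitCount(int) <= 32.
            double count = Math.min(255, Integer.bitCount(positions));

            double wt = 1.5 * 1.5 / 3;    // keyword.weight()^2 / keywordSet.length()
            final double invFreq = Math.log(1.0 + (docCount - wt + 0.5) / (wt + 0.5));

            System.out.println("count=" + count + " invFreq=" + invFreq);
        }
    }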
@@ -85,7 +85,7 @@ class SearchResultValuatorTest {
                 .reduce((a,b) -> a|b)
                 .orElse(0);
 
-        return new WordMetadata(tfIdf, posBits, positions.size(), wordFlags).encode();
+        return new WordMetadata(tfIdf, posBits, wordFlags).encode();
     }
 
 }