From efb46cc7031cd29965a294f95a6b75113471aa20 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 9 Mar 2023 18:14:14 +0100 Subject: [PATCH] Remove count from WordMetadata entirely. --- .../nu/marginalia/model/idx/WordMetadata.java | 19 ++--------- .../nu/marginalia/model/WordMetadataTest.java | 33 ++++--------------- .../processor/keywords/KeywordCounter.java | 4 +-- .../processor/keywords/SubjectCounter.java | 7 +--- .../language/model/KeywordMetadata.java | 13 ++++---- .../language/model/WordFrequencyData.java | 2 +- .../svc/IndexQueryServiceIntegrationTest.java | 4 +-- .../search/svc/SearchApiQueryService.java | 2 +- .../valuation/SearchResultValuator.java | 6 ++-- .../valuation/SearchResultValuatorTest.java | 2 +- 10 files changed, 26 insertions(+), 66 deletions(-) diff --git a/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java b/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java index fa8dfd64..7a7551f6 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java @@ -11,18 +11,9 @@ import static java.lang.Math.min; public record WordMetadata(int tfIdf, int positions, - int count, byte flags) { - public WordMetadata { - if (WordMetadata.class.desiredAssertionStatus()) { - if (Integer.bitCount(positions) > count) { - System.err.println(Integer.bitCount(positions) + ">" + count); - } - } - } - public static final long COUNT_MASK = 0xFL; - public static final int COUNT_SHIFT = 8; + // 8 unused bits at the beginning public static final long TF_IDF_MASK = 0xFFFFL; public static final int TF_IDF_SHIFT = 16; @@ -41,17 +32,15 @@ public record WordMetadata(int tfIdf, this( (int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK), (int)((value >>> POSITIONS_SHIFT) & POSITIONS_MASK), - Integer.bitCount((int) ((value >>> POSITIONS_SHIFT) & POSITIONS_MASK)), (byte) (value & FLAGS_MASK) ); } public WordMetadata(int tfIdf, int 
positions, - int count, Set flags) { - this(tfIdf, positions, count, encodeFlags(flags)); + this(tfIdf, positions, encodeFlags(flags)); } private static byte encodeFlags(Set flags) { @@ -82,7 +71,6 @@ public record WordMetadata(int tfIdf, StringBuilder sb = new StringBuilder(getClass().getSimpleName()); sb.append('[') .append("tfidf=").append(tfIdf).append(", ") - .append("count=").append(count).append(", ") .append("positions=[").append(BrailleBlockPunchCards.printBits(positions, 32)).append(']'); sb.append(", flags=").append(flags).append(']'); return sb.toString(); @@ -95,14 +83,13 @@ public record WordMetadata(int tfIdf, ret |= Byte.toUnsignedLong(flags); ret |= min(TF_IDF_MASK, max(0, tfIdf)) << TF_IDF_SHIFT; - ret |= min(COUNT_MASK, max(0, count)) << COUNT_SHIFT; ret |= ((long)(positions)) << POSITIONS_SHIFT; return ret; } public boolean isEmpty() { - return count == 0 && positions == 0 && flags == 0 && tfIdf == 0; + return positions == 0 && flags == 0 && tfIdf == 0; } public static long emptyValue() { diff --git a/code/common/model/src/test/java/nu/marginalia/model/WordMetadataTest.java b/code/common/model/src/test/java/nu/marginalia/model/WordMetadataTest.java index 6ca2d2b8..6f612374 100644 --- a/code/common/model/src/test/java/nu/marginalia/model/WordMetadataTest.java +++ b/code/common/model/src/test/java/nu/marginalia/model/WordMetadataTest.java @@ -12,16 +12,16 @@ class WordMetadataTest { @Test public void codecTest() { - verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class))); - verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class))); - verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, 1, EnumSet.noneOf(EdgePageWordFlags.class))); - System.out.println(new WordMetadata(32, 0x7f0f0005, 1, EnumSet.allOf(EdgePageWordFlags.class))); - System.out.println(new WordMetadata(32, 0xff0f0013, 1, EnumSet.noneOf(EdgePageWordFlags.class))); + verifyCodec("Vanilla 
case", new WordMetadata(32, 0x7f0f0000, EnumSet.allOf(EdgePageWordFlags.class))); + verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, EnumSet.allOf(EdgePageWordFlags.class))); + verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, EnumSet.noneOf(EdgePageWordFlags.class))); + System.out.println(new WordMetadata(32, 0x7f0f0005, EnumSet.allOf(EdgePageWordFlags.class))); + System.out.println(new WordMetadata(32, 0xff0f0013, EnumSet.noneOf(EdgePageWordFlags.class))); } @Test public void testClampTfIdfLow() { - var original = new WordMetadata(0x8000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class)); + var original = new WordMetadata(0x8000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class)); var encoded = new WordMetadata(original.encode()); assertEquals(original.positions(), encoded.positions()); @@ -30,32 +30,13 @@ class WordMetadataTest { @Test public void testClampTfIdfHigh() { - var original = new WordMetadata(0x7000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class)); + var original = new WordMetadata(0x7000FFFF, 0, EnumSet.noneOf(EdgePageWordFlags.class)); var encoded = new WordMetadata(original.encode()); assertEquals(original.positions(), encoded.positions()); assertEquals(65535, encoded.tfIdf()); } - @Test - public void testClampCountLow() { - var original = new WordMetadata(40, 0, -1, EnumSet.noneOf(EdgePageWordFlags.class)); - var encoded = new WordMetadata(original.encode()); - - assertEquals(original.positions(), encoded.positions()); - assertEquals(0, encoded.count()); - } - - @Test - public void testClampCountHigh() { - var original = new WordMetadata(40, 0, 17, EnumSet.noneOf(EdgePageWordFlags.class)); - var encoded = new WordMetadata(original.encode()); - - assertEquals(original.positions(), encoded.positions()); - assertEquals(15, encoded.count()); - } - - public void verifyCodec(String message, WordMetadata data) { assertEquals(data, new WordMetadata(data.encode()), message); } diff --git 
a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/KeywordCounter.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/KeywordCounter.java index c153be0b..91846a9b 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/KeywordCounter.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/KeywordCounter.java @@ -52,7 +52,7 @@ public class KeywordCounter { } } - HashMap tfIdf = keywordMetadata.wordsTfIdf(); + Object2IntOpenHashMap tfIdf = keywordMetadata.wordsTfIdf(); List tfIdfHigh = new ArrayList<>(); int maxVal = maxValue(counts); @@ -61,7 +61,7 @@ public class KeywordCounter { counts.forEach((key, cnt) -> { int value = getTermValue(key, cnt, maxVal); - tfIdf.put(key, new WordFrequencyData(cnt, value)); + tfIdf.put(key, value); if (cnt > 1 && value > 100) { tfIdfHigh.addAll(instances.get(key)); diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/SubjectCounter.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/SubjectCounter.java index e99cbb5c..cb77d526 100644 --- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/SubjectCounter.java +++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/SubjectCounter.java @@ -81,12 +81,7 @@ public class SubjectCounter { return sum / parts.length; } - var meta = keywordMetadata.wordsTfIdf().get(stemmed); - if (meta != null) { - return meta.tfIdfNormalized(); - } - - return 0; + return keywordMetadata.wordsTfIdf().getOrDefault(stemmed, 0); } private boolean isDetOrAdverbOrVerb(String posTag) { diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java 
b/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java index c18fb5da..14ef7268 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java @@ -11,17 +11,17 @@ import java.util.Objects; public final class KeywordMetadata { - private static final WordFrequencyData empty = new WordFrequencyData(0, 0); + private static final WordFrequencyData empty = new WordFrequencyData(0); private final HashSet titleKeywords = new HashSet<>(50); private final HashSet subjectKeywords = new HashSet<>(10); private final HashSet namesKeywords = new HashSet<>(50); - private final HashMap wordsTfIdf; + private final Object2IntOpenHashMap wordsTfIdf; private final Object2IntOpenHashMap positionMask; private final EnumSet wordFlagsTemplate; public KeywordMetadata(EnumSet flags) { this.positionMask = new Object2IntOpenHashMap<>(10_000, 0.7f); - this.wordsTfIdf = new HashMap<>(10_000); + this.wordsTfIdf = new Object2IntOpenHashMap<>(10_000, 0.7f); this.wordFlagsTemplate = flags; } @@ -31,7 +31,7 @@ public final class KeywordMetadata { public long getMetadataForWord(EnumSet flagsTemplate, String stemmed) { - WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty); + int tfidf = wordsTfIdf.getOrDefault(stemmed, 0); EnumSet flags = flagsTemplate.clone(); if (subjectKeywords.contains(stemmed)) @@ -44,9 +44,8 @@ public final class KeywordMetadata { flags.add(EdgePageWordFlags.Title); int positions = positionMask.getOrDefault(stemmed, 0); - int count = Math.max(Integer.bitCount(positions), tfidf.count()); - return new WordMetadata(tfidf.tfIdfNormalized(), positions, count, flags).encode(); + return new WordMetadata(tfidf, positions, flags).encode(); } public HashSet titleKeywords() { @@ -61,7 +60,7 @@ public final class KeywordMetadata { return namesKeywords; } - public HashMap 
wordsTfIdf() { + public Object2IntOpenHashMap wordsTfIdf() { return wordsTfIdf; } diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/WordFrequencyData.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/WordFrequencyData.java index 3435a702..fd201682 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/WordFrequencyData.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/WordFrequencyData.java @@ -1,4 +1,4 @@ package nu.marginalia.language.model; -public record WordFrequencyData(int count, int tfIdfNormalized) { } \ No newline at end of file +public record WordFrequencyData(int tfIdfNormalized) { } \ No newline at end of file diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java index 0b3e198a..0d33c294 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java @@ -169,7 +169,7 @@ public class IndexQueryServiceIntegrationTest { long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) { data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i])); - data[2*i + 1] = new WordMetadata(i, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode(); + data[2*i + 1] = new WordMetadata(i, i, EnumSet.of(EdgePageWordFlags.Title)).encode(); } indexJournalWriter.put(header, new IndexJournalEntryData(data)); @@ -182,7 +182,7 @@ public class IndexQueryServiceIntegrationTest { long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) { data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i])); - data[2*i + 1] = new 
WordMetadata(i % 20, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode(); + data[2*i + 1] = new WordMetadata(i % 20, i, EnumSet.of(EdgePageWordFlags.Title)).encode(); } indexJournalWriter.put(header, new IndexJournalEntryData(data)); diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java index d00fa7fc..8c2e6038 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java @@ -73,7 +73,7 @@ public class SearchApiQueryService { continue outer; Set flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet()); - lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), metadata.count(), flags)); + lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), Integer.bitCount(metadata.positions()), flags)); } details.add(lst); } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java index 3cf1bfbb..1ae99928 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java @@ -104,7 +104,7 @@ public class SearchResultValuator { double sum = 0.; for (var keyword : keywordSet) { - double count = Math.min(255, keyword.count()); + double count = Math.min(255, Integer.bitCount(keyword.wordMetadata().positions())); double wt = keyword.weight() * keyword.weight() / keywordSet.length(); final double invFreq = Math.log(1.0 + (docCount - wt + 0.5)/(wt + 0.5)); @@ -313,9 +313,7 @@ public 
class SearchResultValuator { public int tfIdf() { return wordMetadata.tfIdf(); } - public int count() { - return wordMetadata.count(); - } + public EnumSet flags() { return wordMetadata.flagSet(); } diff --git a/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java b/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java index ec23af7f..34637872 100644 --- a/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java +++ b/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java @@ -85,7 +85,7 @@ class SearchResultValuatorTest { .reduce((a,b) -> a|b) .orElse(0); - return new WordMetadata(tfIdf, posBits, positions.size(), wordFlags).encode(); + return new WordMetadata(tfIdf, posBits, wordFlags).encode(); } } \ No newline at end of file