From 86a5cc5c5f3bd4fd81fb30371a7865f362ca300c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 14:57:40 +0200 Subject: [PATCH] (hash) Modified version of common codec's Murmur3 hash --- settings.gradle | 2 + third-party/README.md | 1 + third-party/commons-codec/build.gradle | 20 ++ third-party/commons-codec/readme.md | 34 +++ .../nu/marginalia/hash/MurmurHashBench.java | 105 +++++++ .../nu/marginalia/hash/MurmurHash3_128.java | 277 ++++++++++++++++++ 6 files changed, 439 insertions(+) create mode 100644 third-party/commons-codec/build.gradle create mode 100644 third-party/commons-codec/readme.md create mode 100644 third-party/commons-codec/src/jmh/java/nu/marginalia/hash/MurmurHashBench.java create mode 100644 third-party/commons-codec/src/main/java/nu/marginalia/hash/MurmurHash3_128.java diff --git a/settings.gradle b/settings.gradle index 131b449e..62bc0f34 100644 --- a/settings.gradle +++ b/settings.gradle @@ -80,6 +80,7 @@ include 'third-party:openzim' include 'third-party:count-min-sketch' include 'third-party:monkey-patch-opennlp' include 'third-party:monkey-patch-gson' +include 'third-party:commons-codec' dependencyResolutionManagement { @@ -142,6 +143,7 @@ dependencyResolutionManagement { library('commons.lang3', 'org.apache.commons','commons-lang3').version('3.12.0') library('commons.compress','org.apache.commons','commons-compress').version('1.21') library('commons.io','commons-io','commons-io').version('2.11.0') + library('commons.codec', 'commons-codec', 'commons-codec').version('1.16.0') library('ffi','com.github.jnr','jnr-ffi').version('2.2.12') library('databind','com.fasterxml.jackson.core','jackson-databind').version('2.13.2.1') diff --git a/third-party/README.md b/third-party/README.md index c31ca585..d6b8a834 100644 --- a/third-party/README.md +++ b/third-party/README.md @@ -10,6 +10,7 @@ or lack an artifact, or to override some default that is inappropriate for the t * [PorterStemmer](porterstemmer/) - LGPL3 * [Uppend](uppend/) - MIT * [OpenZIM](openzim/) - GPL-2.0 +* [Commons Codec](commons-codec/) - Apache 2.0 ### Repackaged * [SymSpell](symspell/) - LGPL-3.0 diff --git a/third-party/commons-codec/build.gradle b/third-party/commons-codec/build.gradle new file mode 100644 index 00000000..600269c8 --- /dev/null +++ b/third-party/commons-codec/build.gradle @@ -0,0 +1,20 @@ +plugins { + id 'java' + id "me.champeau.jmh" version "0.6.6" +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + jmhImplementation project(':code:libraries:language-processing') + jmhImplementation libs.guava + jmhImplementation libs.commons.codec +} + +test { + useJUnitPlatform() +} diff --git a/third-party/commons-codec/readme.md b/third-party/commons-codec/readme.md new file mode 100644 index 00000000..71232ae7 --- /dev/null +++ b/third-party/commons-codec/readme.md @@ -0,0 +1,34 @@ +# Commons Codec + +License: [APL 2.0](http://www.apache.org/licenses/LICENSE-2.0) + +This package contains a heavily modified version of the Murmur3 hash from [commons-codec](https://commons.apache.org/proper/commons-codec/) +that cuts some corners but outperforms both Commons Codec and Guava fairly significantly for the particular use cases +we care about being fast: Hashing ASCII/Latin1 strings into a well behaving 64-bit hash. + +The method `hashLowerBytes(String data)` performs a zero allocation and zero conversion hash of +the *lower bytes* of the characters in the provided string. 
For ASCII, Latin1, or other 8 bit encodings
+this is identical to hashing the entire string. For other use cases, especially away from the
+Latin scripts, this function is possibly a foot-gun.
+
+The method `hashNearlyASCII(String data)` is the same as above, except it's
+seeded with Java String's hashCode(). This is a very non-standard modification that
+makes it a bit better at dealing with other encodings without measurable performance
+impact.
+
+The method `long hash(byte[] data)` hashes the entire byte array.
+
+A non-standard behavior is that the hash function folds the 128 bit
+hash into a 64 bit hash by xor:ing its two 64 bit halves.
+
+## Performance Benchmarks
+
+| Algorithm          | Ops/s             | Remark                                                            |
+|--------------------|-------------------|-------------------------------------------------------------------|
+| Guava              | 12,114 ± 439      | allocates byte buffers internally                                 |
+| Commons Codec      | 29,224 ± 1,080    | String.getBytes() penalty, long\[2\] allocation, possibly elided  |
+| MH hash            | 30,885 ± 847      | String.getBytes() penalty, zero allocations                       |
+| MH hashNearlyASCII | 50,018 ± 399      | Zero allocations, worse characteristics outside Latin1/ASCII      |
+| MH hashLowerBytes  | 50,533 ± 478      | Zero allocations, only works for Latin1/ASCII                     |
+| String.hashCode()  | 567,381 ± 136,185 | Zero allocations, much weaker algorithm                           |
+
diff --git a/third-party/commons-codec/src/jmh/java/nu/marginalia/hash/MurmurHashBench.java b/third-party/commons-codec/src/jmh/java/nu/marginalia/hash/MurmurHashBench.java
new file mode 100644
index 00000000..a4cc3029
--- /dev/null
+++ b/third-party/commons-codec/src/jmh/java/nu/marginalia/hash/MurmurHashBench.java
@@ -0,0 +1,105 @@
+package nu.marginalia.hash;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import org.apache.commons.codec.digest.MurmurHash3;
+import org.openjdk.jmh.annotations.*;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+public class MurmurHashBench {
+
+    private static final HashFunction guavaHashFunction = Hashing.murmur3_128();
+    private static final MurmurHash3_128 marginaliahash = new MurmurHash3_128();
+
+    @State(Scope.Benchmark)
+    public static class BenchState {
+
+        List<String> strings;
+
+        @Setup(Level.Trial)
+        public void doSetup() {
+            strings = new ArrayList<>();
+            try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
+                    "Could not load word frequency table");
+                 var br = new BufferedReader(new InputStreamReader(resource))
+            ) {
+                for (;;) {
+                    String s = br.readLine();
+                    if (s == null) {
+                        break;
+                    }
+                    strings.add(s.toLowerCase());
+                }
+            }
+            catch (Exception ex) {
+                throw new RuntimeException(ex);
+            }
+        }
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public long benchGuava(BenchState state) {
+        long total = 0;
+        for (var string : state.strings) {
+            total += guavaHashFunction.hashUnencodedChars(string).padToLong();
+        }
+        return total;
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public long benchCommonCodec(BenchState state) {
+        long total = 0;
+        for (var string : state.strings) {
+            total += MurmurHash3.hash128x64(string.getBytes(StandardCharsets.UTF_8))[0];
+        }
+        return total;
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public long benchMarginalia_hashNonStandardASCIIOnlyDirect(BenchState state) {
+        long total = 0;
+        for (var string : state.strings) {
+            total += marginaliahash.hashLowerBytes(string);
+        }
+        return total;
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public long benchMarginalia_hashStandard(BenchState state) {
+        long total = 0;
+        for (var string : state.strings) {
+            total += marginaliahash.hash(string.getBytes(StandardCharsets.UTF_8));
+        }
+        return total;
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public long benchJavaStringHash(BenchState state) {
+        long total = 0;
+        for (var string : state.strings) {
+            total += string.hashCode();
+        }
+        return total;
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public long benchWeakNonAscii(BenchState state) {
+        long total = 0;
+        for (var string : state.strings) {
+            total += marginaliahash.hashNearlyASCII(string);
+        }
+        return total;
+    }
+}
diff --git a/third-party/commons-codec/src/main/java/nu/marginalia/hash/MurmurHash3_128.java b/third-party/commons-codec/src/main/java/nu/marginalia/hash/MurmurHash3_128.java
new file mode 100644
index 00000000..cd767d10
--- /dev/null
+++ b/third-party/commons-codec/src/main/java/nu/marginalia/hash/MurmurHash3_128.java
@@ -0,0 +1,277 @@
+package nu.marginalia.hash;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** A modified version of Commons Codec's Murmur3 hash
+ * that minimizes allocations.
+ */
+public class MurmurHash3_128 {
+
+    /**
+     * A default seed to use for the murmur hash algorithm.
+     * Has the value {@code 104729}.
+     */
+    public static final int DEFAULT_SEED = 104729;
+
+    // Constants for 128-bit variant
+    private static final long C1 = 0x87c37b91114253d5L;
+    private static final long C2 = 0x4cf5ad432745937fL;
+    private static final int R1 = 31;
+    private static final int R2 = 27;
+    private static final int R3 = 33;
+    private static final int M = 5;
+    private static final int N1 = 0x52dce729;
+    private static final int N2 = 0x38495ab5;
+
+    /** Assumes data is ASCII, or at the very least that you only care about the lower
+     * bytes of your string (which may be fine for hashing mostly latin script).
+     * <p>
+ * Fold the 128 bit hash into 64 bits by xor:ing msw and lsw + */ + public long hashLowerBytes(String data) { + return hash64(data, 0, data.length(), DEFAULT_SEED); + } + + /** Like hashASCIIOnly except seeded with the Java String.hashCode() + * to provide better behavior for non-ASCII strings. It's much worse + * than doing it properly, but better than not doing this. + */ + public long hashNearlyASCII(String data) { + return hash64(data, 0, data.length(), data.hashCode()); + } + + /** Hash the bytes; fold the 128 bit hash into 64 bits by xor:ing msw and lsw */ + public long hash(byte[] data) { + return hash64(data, 0, data.length, DEFAULT_SEED); + } + + private static long hash64(final CharSequence data, final int offset, final int length, final long seed) { + long h1 = seed; + long h2 = seed; + final int nblocks = length >> 4; + + // body + for (int i = 0; i < nblocks; i++) { + final int index = offset + (i << 4); + long k1 = getLittleEndianLong(data, index); + long k2 = getLittleEndianLong(data, index + 8); + + // mix functions for k1 + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + h1 = Long.rotateLeft(h1, R2); + h1 += h2; + h1 = h1 * M + N1; + + // mix functions for k2 + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + h2 = Long.rotateLeft(h2, R1); + h2 += h1; + h2 = h2 * M + N2; + } + + // tail + long k1 = 0; + long k2 = 0; + final int index = offset + (nblocks << 4); + switch (offset + length - index) { + case 15: + k2 ^= ((long) data.charAt(index + 14) & 0xff) << 48; + case 14: + k2 ^= ((long) data.charAt(index + 13) & 0xff) << 40; + case 13: + k2 ^= ((long) data.charAt(index + 12) & 0xff) << 32; + case 12: + k2 ^= ((long) data.charAt(index + 11) & 0xff) << 24; + case 11: + k2 ^= ((long) data.charAt(index + 10) & 0xff) << 16; + case 10: + k2 ^= ((long) data.charAt(index + 9) & 0xff) << 8; + case 9: + k2 ^= data.charAt(index + 8) & 0xff; + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + + case 8: + k1 ^= ((long) data.charAt(index + 7) & 0xff) << 56; + case 7: + k1 ^= ((long) data.charAt(index + 6) & 0xff) << 48; + case 6: + k1 ^= ((long) data.charAt(index + 5) & 0xff) << 40; + case 5: + k1 ^= ((long) data.charAt(index + 4) & 0xff) << 32; + case 4: + k1 ^= ((long) data.charAt(index + 3) & 0xff) << 24; + case 3: + k1 ^= ((long) data.charAt(index + 2) & 0xff) << 16; + case 2: + k1 ^= ((long) data.charAt(index + 1) & 0xff) << 8; + case 1: + k1 ^= data.charAt(index) & 0xff; + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + } + + // finalization + h1 ^= length; + h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + return h1^h2; // non-standard 128->64 bit transformation + } + + private static long hash64(final byte[] data, final int offset, final int length, final long seed) { + long h1 = seed; + long h2 = seed; + final int nblocks = length >> 4; + + // body + for (int i = 0; i < nblocks; i++) { + final int index = offset + (i << 4); + long k1 = getLittleEndianLong(data, index); + long k2 = getLittleEndianLong(data, index + 8); + + // mix functions for k1 + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + h1 = Long.rotateLeft(h1, R2); + h1 += h2; + h1 = h1 * M + N1; + + // mix functions for k2 + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + h2 = Long.rotateLeft(h2, R1); + h2 += h1; + h2 = h2 * M + N2; + } + + // tail + long k1 = 0; + long k2 = 0; + final int index = offset + (nblocks << 4); + switch (offset + 
length - index) { + case 15: + k2 ^= ((long) data[index + 14] & 0xff) << 48; + case 14: + k2 ^= ((long) data[index + 13] & 0xff) << 40; + case 13: + k2 ^= ((long) data[index + 12] & 0xff) << 32; + case 12: + k2 ^= ((long) data[index + 11] & 0xff) << 24; + case 11: + k2 ^= ((long) data[index + 10] & 0xff) << 16; + case 10: + k2 ^= ((long) data[index + 9] & 0xff) << 8; + case 9: + k2 ^= data[index + 8] & 0xff; + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + + case 8: + k1 ^= ((long) data[index + 7] & 0xff) << 56; + case 7: + k1 ^= ((long) data[index + 6] & 0xff) << 48; + case 6: + k1 ^= ((long) data[index + 5] & 0xff) << 40; + case 5: + k1 ^= ((long) data[index + 4] & 0xff) << 32; + case 4: + k1 ^= ((long) data[index + 3] & 0xff) << 24; + case 3: + k1 ^= ((long) data[index + 2] & 0xff) << 16; + case 2: + k1 ^= ((long) data[index + 1] & 0xff) << 8; + case 1: + k1 ^= data[index] & 0xff; + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + } + + // finalization + h1 ^= length; + h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + return h1^h2; // non-standard 128->64 bit transformation + } + + private static long getLittleEndianLong(final CharSequence data, final int index) { + return (((long) data.charAt(index ) & 0xff) ) | + (((long) data.charAt(index + 1) & 0xff) << 8) | + (((long) data.charAt(index + 2) & 0xff) << 16) | + (((long) data.charAt(index + 3) & 0xff) << 24) | + (((long) data.charAt(index + 4) & 0xff) << 32) | + (((long) data.charAt(index + 5) & 0xff) << 40) | + (((long) data.charAt(index + 6) & 0xff) << 48) | + (((long) data.charAt(index + 7) & 0xff) << 56); + } + + private static long getLittleEndianLong(final byte[] data, final int index) { + return (((long) data[index ] & 0xff) ) | + (((long) data[index + 1] & 0xff) << 8) | + (((long) data[index + 2] & 0xff) << 16) | + (((long) data[index + 3] & 0xff) << 24) | + (((long) data[index + 4] & 0xff) << 32) | + (((long) data[index + 5] & 0xff) << 40) | + (((long) data[index + 6] & 0xff) << 48) | + (((long) data[index + 7] & 0xff) << 56); + } + private static long fmix64(long hash) { + hash ^= (hash >>> 33); + hash *= 0xff51afd7ed558ccdL; + hash ^= (hash >>> 33); + hash *= 0xc4ceb9fe1a85ec53L; + hash ^= (hash >>> 33); + return hash; + } + +}
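
A minimal usage sketch of the API this patch introduces. The three public methods (`hashLowerBytes`, `hashNearlyASCII`, `hash(byte[])`) come from MurmurHash3_128 above; the wrapper class and the sample input are illustrative assumptions, not part of the change.

    // Illustrative only: the wrapper class and inputs are assumptions, not part of the patch.
    import nu.marginalia.hash.MurmurHash3_128;
    import java.nio.charset.StandardCharsets;

    class MurmurHashExample {
        public static void main(String[] args) {
            var hash = new MurmurHash3_128();

            // Fast path: mixes only the low byte of each char; fine for ASCII/Latin1 input
            long h1 = hash.hashLowerBytes("commons-codec");

            // Same low-byte walk, but seeded with String.hashCode() to soften
            // collisions on non-Latin1 input
            long h2 = hash.hashNearlyASCII("commons-codec");

            // Byte-array variant: hashes every byte of the encoded string
            long h3 = hash.hash("commons-codec".getBytes(StandardCharsets.UTF_8));

            // All three fold the 128 bit Murmur3 result into 64 bits by xor:ing the halves
            System.out.printf("%016x %016x %016x%n", h1, h2, h3);
        }
    }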