// Origin (from the patch header): marginalia_nu/src/main/java/nu/marginalia/util/EasyLSH.java,
// declared there in package nu.marginalia.util.

import java.util.List;
import java.util.Set;

/**
 * A very simple locality sensitive hash (LSH) for collections of Java objects.
 *
 * <p>The resulting LSH is a 64 bit value. The Hamming distance between two such
 * values is a measure of how similar the two collections are, where a smaller
 * distance implies greater similarity.
 *
 * <p>It hinges on a lot of relatively sketchy assumptions about
 * {@link Object#hashCode()}.
 *
 * <p>An instance is a mutable accumulator: feed it elements with the
 * {@code add*} methods, then read the signature with {@link #get()} or compare
 * two accumulators with {@link #hammingDistance(EasyLSH, EasyLSH)}. No
 * synchronization is performed.
 */
public class EasyLSH {
    // This parameter determines the amount of shingling. Increasing this
    // increases the sensitivity to word ordering, but also magnifies
    // the impact of each difference overall.
    //
    // This must be a power of 2, lest things break
    private static final int SHINGLING = 2;
    static { assert Integer.bitCount(SHINGLING) == 1; }

    /** One signed vote counter per output bit; the sign of each counter becomes one bit of the hash. */
    private final int[] fields = new int[64];

    /** Ring buffer holding the last {@code SHINGLING} hash codes passed to {@link #addHashOrdered(int)}. */
    private final int[] prevHashes = new int[SHINGLING];

    /** Monotonically increasing write position into {@link #prevHashes}; masked on use. */
    private int prevHashIdx = 0;

    /**
     * Adds an element without regard to its position in the collection.
     *
     * @param o the element to add; its {@code hashCode()} is what gets hashed
     * @throws NullPointerException if {@code o} is null
     */
    public void addUnordered(Object o) {
        addHashUnordered(o.hashCode());
    }

    /**
     * Adds an element whose contribution also depends on the element(s) added
     * immediately before it through the ordered methods.
     *
     * @param o the element to add; its {@code hashCode()} is what gets hashed
     * @throws NullPointerException if {@code o} is null
     */
    public void addOrdered(Object o) {
        addHashOrdered(o.hashCode());
    }

    /**
     * Adds a pre-computed hash code, combined with the previous
     * {@code SHINGLING - 1} hash codes added through the ordered methods.
     *
     * @param hashCode hash code of the next element in the sequence
     */
    public void addHashOrdered(int hashCode) {
        // Shingle exactly once. Running the value through shingleHash() twice
        // stores the combined value back into the window and cancels the current
        // hash out of the result, so the element being added never contributes.
        addHashUnordered(shingleHash(hashCode));
    }

    /**
     * Adds a pre-computed hash code as a single vote on one of the 64 fields.
     * Because votes are summed, the order of calls does not affect the result.
     *
     * @param hashCode hash code to fold into the signature
     */
    public void addHashUnordered(int hashCode) {
        // Bit 1 decides the direction of the vote: +1 when clear, -1 when set.
        int value = 1 - (hashCode & 2);

        // Select the field by XOR-folding bits 2-7, 10-15, 18-23 and 26-31 into
        // a 6 bit index. Bits 0, 8, 9, 16, 17, 24 and 25 do not take part.
        int field = 63 & ((hashCode >>> 2)
                ^ (hashCode >>> 10)
                ^ (hashCode >>> 18)
                ^ (hashCode >>> 26));

        fields[field] += value;
    }

    /**
     * Records {@code nextHash} in the shingle window and returns the XOR of
     * every hash currently in the window.
     */
    private int shingleHash(int nextHash) {
        // The mask keeps the index in range even after prevHashIdx overflows,
        // which is why SHINGLING has to be a power of two.
        prevHashes[prevHashIdx++ & (SHINGLING - 1)] = nextHash;

        int ret = 0;
        for (int hashPart : prevHashes) {
            ret ^= hashPart;
        }
        return ret;
    }

    /**
     * Returns the 64 bit signature accumulated so far.
     *
     * @return a value with one bit per field, set when that field's counter is
     *         negative; field 0 is the most significant bit
     */
    public long get() {
        long val = 0;

        for (int f : fields) {
            // f >>> 31 isolates the sign bit of the counter.
            val = (val << 1) | (f >>> 31);
        }

        return val;
    }

    /**
     * Returns the Hamming distance between this accumulator and {@code other}.
     *
     * @param other the accumulator to compare against
     * @return the number of fields whose signs differ, from 0 to 64
     */
    public int hammingDistance(EasyLSH other) {
        return hammingDistance(this, other);
    }

    /**
     * Returns the Hamming distance between two signatures obtained from {@link #get()}.
     *
     * @param a first signature
     * @param b second signature
     * @return the number of bit positions in which {@code a} and {@code b} differ
     */
    public static int hammingDistance(long a, long b) {
        return Long.bitCount(a ^ b);
    }

    /**
     * Returns the Hamming distance between two accumulators without
     * materializing their signatures. The result equals
     * {@code hammingDistance(a.get(), b.get())}.
     *
     * @param a first accumulator
     * @param b second accumulator
     * @return the number of fields whose signs differ, from 0 to 64
     */
    public static int hammingDistance(EasyLSH a, EasyLSH b) {
        int distance = 0;

        for (int i = 0; i < a.fields.length; i++) {
            // The sign bit of the XOR is set exactly when the two signs differ.
            distance += (a.fields[i] ^ b.fields[i]) >>> 31;
        }

        return distance;
    }

}
b/marginalia_nu/src/test/java/nu/marginalia/util/EasyLSHTest.java new file mode 100644 index 00000000..e8c3f147 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/util/EasyLSHTest.java @@ -0,0 +1,77 @@ +package nu.marginalia.util; + +import org.junit.jupiter.api.Test; + +import java.util.Arrays; + +class EasyLSHTest { + + @Test + public void testEZLSH() { + String sA = """ + In computer science, locality-sensitive hashing (LSH) is an algorithmic technique that hashes similar input items + into the same "buckets" with high probability.[1] (The number of buckets is much smaller than the universe of possible + input items.)[1] Since similar items end up in the same buckets, this technique can be used for data clustering and + nearest neighbor search. It differs from conventional hashing techniques in that hash collisions are maximized, not minimized. + Alternatively, the technique can be seen as a way to reduce the dimensionality of high-dimensional data; high-dimensional input + items can be reduced to low-dimensional versions while preserving relative distances between items.; + """; + + String sB = """ + In computer science, locality-sensitive hashing (LSH) is an algorithmic technique that hashes similar input items + into the same "buckets" with high probability.[1] (The number of buckets is much smaller than the universe of possible + input items.)[1] Since similar items end up in the same buckets, this technique can be used for data clustering and + nearest neighbor search. + + The wrath sing, goddess, of Peleus' son, Achilles, that destructive wrath which brought countless woes upon the Achaeans, + and sent forth to Hades many valiant souls of heroes, and made them themselves spoil for dogs and every bird; thus the plan + of Zeus came to fulfillment, [5] from the time when1 first they parted in strife Atreus' son, king of men, and brilliant Achilles. + Who then of the gods was it that brought these two together to contend? 
+ + It differs from conventional hashing techniques in that hash collisions are maximized, not minimized. + Alternatively, the technique can be seen as a way to reduce the dimensionality of high-dimensional data; high-dimensional input + items can be reduced to low-dimensional versions while preserving relative distances between items.; + """; + + String sC = """ + The wrath sing, goddess, of Peleus' son, Achilles, that destructive wrath which brought countless woes upon the Achaeans, + and sent forth to Hades many valiant souls of heroes, and made them themselves spoil for dogs and every bird; thus the plan + of Zeus came to fulfillment, [5] from the time when1 first they parted in strife Atreus' son, king of men, and brilliant Achilles. + Who then of the gods was it that brought these two together to contend? + """; + + String sD = """ + Quo usque tandem abutere, Catilina, patientia nostra? quam diu etiam furor iste tuus nos eludet? quem ad finem sese effrenata iactabit + audacia? Nihilne te nocturnum praesidium Palati, nihil urbis vigiliae, nihil timor populi, nihil concursus bonorum omnium, nihil hic + munitissimus habendi senatus locus, nihil horum ora voltusque moverunt? Patere tua consilia non sentis, constrictam iam horum omnium + scientia teneri coniurationem tuam non vides? Quid proxima, quid superiore nocte egeris, ubi fueris, quos convocaveris, quid consilii + ceperis, quem nostrum ignorare arbitraris? [2] O tempora, o mores! Senatus haec intellegit. Consul videt; hic tamen vivit. Vivit? immo + vero etiam in senatum venit, fit publici consilii particeps, notat et designat oculis ad caedem unum quemque nostrum. Nos autem fortes + viri satis facere rei publicae videmur, si istius furorem ac tela vitemus. Ad mortem te, Catilina, duci iussu consulis iam pridem oportebat, + in te conferri pestem, quam tu in nos [omnes iam diu] machinaris. 
+ """; + + EasyLSH hashA = new EasyLSH(); + Arrays.stream(sA.split("\\s")).forEach(hashA::addOrdered); + EasyLSH hashB = new EasyLSH(); + Arrays.stream(sB.split("\\s")).forEach(hashB::addOrdered); + EasyLSH hashC = new EasyLSH(); + Arrays.stream(sC.split("\\s")).forEach(hashC::addOrdered); + + EasyLSH hashD = new EasyLSH(); + Arrays.stream(sD.split("\\s")).forEach(hashD::addOrdered); + + System.out.println(Long.toBinaryString(hashA.get())); + System.out.println(Long.toBinaryString(hashB.get())); + System.out.println(Long.toBinaryString(hashC.get())); + System.out.println(Long.toBinaryString(hashD.get())); + + System.out.println(EasyLSH.hammingDistance(hashA, hashB)); + System.out.println(EasyLSH.hammingDistance(hashB, hashC)); + System.out.println(EasyLSH.hammingDistance(hashA, hashC)); + + System.out.println(EasyLSH.hammingDistance(hashA, hashD)); + System.out.println(EasyLSH.hammingDistance(hashB, hashD)); + System.out.println(EasyLSH.hammingDistance(hashC, hashD)); + } +} \ No newline at end of file