MarginaliaSearch/code/libraries/easy-lsh/java/nu/marginalia/lsh/EasyLSH.java

88 lines
2.3 KiB
Java
Raw Normal View History

2023-03-04 12:19:01 +00:00
package nu.marginalia.lsh;
/** This is a very simple locality sensitive hash for collections of Java objects.
* <p>
* The resulting LSH is a 64 bit value, whose hamming distance is a measure
2024-08-21 08:12:00 +00:00
* of the similarity of the two collections, where a smaller value implies
* similarity.
* <p>
* It hinges on a lot of relatively sketchy assumptions about Object$hashCode().
*
*/
public class EasyLSH {
// This parameter determines the amount of shingling. Increasing this
// increases the sensitivity to word ordering, but also magnifies
// the impact of each difference overall.
//
// This must be a power of 2, lest things break
private static final int SHINGLING = 2;
static { assert Integer.bitCount(SHINGLING) == 1; }
private final int[] fields = new int[64];
private final int[] prevHashes = new int[SHINGLING];
private int prevHashIdx = 0;
public void addUnordered(Object o) {
addHashUnordered(o.hashCode());
}
public void addOrdered(Object o) {
addHashOrdered(o.hashCode());
}
public void addHashOrdered(int hashCode) {
addHashUnordered(shingleHash(hashCode));
}
public void addHashUnordered(int hashCode) {
2023-03-06 17:32:13 +00:00
int value = 1 - (hashCode & 2);
// Try to extract all the remaining entropy
// into selecting the field to update
2023-03-04 12:19:01 +00:00
int field = (hashCode >> 2)
^ (hashCode >>> 8)
^ (hashCode >>> 14)
^ (hashCode >>> 20)
^ (hashCode >>> 26);
2023-03-04 12:19:01 +00:00
fields[field & 63] += value;
}
private int shingleHash(int nextHash) {
prevHashes[prevHashIdx++ & (SHINGLING-1)] = nextHash;
2023-03-04 12:19:01 +00:00
int ret = 0;
for (int hashPart : prevHashes) {
2023-03-04 12:19:01 +00:00
ret = hashPart ^ ret;
}
2023-03-04 12:19:01 +00:00
return ret;
}
public long get() {
long val = 0;
for (int f : fields) {
val = (val << 1) | (f >>> 31);
}
return val;
}
public static int hammingDistance(long a, long b) {
return Long.bitCount(a^b);
}
public static int hammingDistance(EasyLSH a, EasyLSH b) {
int distance = 0;
for (int i = 0; i < a.fields.length; i++) {
distance += (a.fields[i] ^ b.fields[i]) >>> 31;
}
return distance;
}
}