MarginaliaSearch/code/libraries/easy-lsh/java/nu/marginalia/lsh/EasyLSH.java
Viktor Lofgren 1d34224416 (refac) Remove src/main from all source code paths.
Look, this will make the git history look funny, but trimming unnecessary depth from the source tree is a very necessary sanity-preserving measure when dealing with a super-modularized codebase like this one.

While it makes the project configuration a bit less conventional, it will save you several clicks every time you jump between modules.  Which you'll do a lot, because it's *modul*ar.  The src/main/java convention makes a lot of sense for a non-modular project though.  This ain't that.
2024-02-23 16:13:40 +01:00

88 lines
2.3 KiB
Java

package nu.marginalia.lsh;
/** This is a very simple locality sensitive hash for collections of Java objects.
* <p>
* The resulting LSH is a 64 bit value, whose hamming distance is a measure
* of the similarity of the two collections, where smaller distances imply
* greater similarity.
* <p>
* It hinges on a lot of relatively sketchy assumptions about {@link Object#hashCode()}.
*
*/
public class EasyLSH {
// This parameter determines the amount of shingling. Increasing this
// increases the sensitivity to word ordering, but also magnifies
// the impact of each difference overall.
//
// This must be a power of 2, lest things break
private static final int SHINGLING = 2;
static { assert Integer.bitCount(SHINGLING) == 1; }
private final int[] fields = new int[64];
private final int[] prevHashes = new int[SHINGLING];
private int prevHashIdx = 0;
public void addUnordered(Object o) {
addHashUnordered(o.hashCode());
}
public void addOrdered(Object o) {
addHashOrdered(o.hashCode());
}
public void addHashOrdered(int hashCode) {
addHashUnordered(shingleHash(hashCode));
}
public void addHashUnordered(int hashCode) {
int value = 1 - (hashCode & 2);
// Try to extract all the remaining entropy
// into selecting the field to update
int field = (hashCode >> 2)
^ (hashCode >>> 8)
^ (hashCode >>> 14)
^ (hashCode >>> 20)
^ (hashCode >>> 26);
fields[field & 63] += value;
}
private int shingleHash(int nextHash) {
prevHashes[prevHashIdx++ & (SHINGLING-1)] = nextHash;
int ret = 0;
for (int hashPart : prevHashes) {
ret = hashPart ^ ret;
}
return ret;
}
public long get() {
long val = 0;
for (int f : fields) {
val = (val << 1) | (f >>> 31);
}
return val;
}
public static int hammingDistance(long a, long b) {
return Long.bitCount(a^b);
}
public static int hammingDistance(EasyLSH a, EasyLSH b) {
int distance = 0;
for (int i = 0; i < a.fields.length; i++) {
distance += (a.fields[i] ^ b.fields[i]) >>> 31;
}
return distance;
}
}