mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 04:58:59 +00:00
Simple implementation of a locality-sensitive hash for text word.
This commit is contained in:
parent
ff30de7352
commit
b5805063e0
94
marginalia_nu/src/main/java/nu/marginalia/util/EasyLSH.java
Normal file
94
marginalia_nu/src/main/java/nu/marginalia/util/EasyLSH.java
Normal file
@ -0,0 +1,94 @@
|
||||
package nu.marginalia.util;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
 * A very simple locality-sensitive hash (LSH) for collections of Java objects.
 * <p>
 * The resulting LSH is a 64 bit value. The Hamming distance between two such
 * values is a measure of the similarity of the two collections, where smaller
 * distances imply greater similarity.
 * <p>
 * It hinges on a lot of relatively sketchy assumptions about
 * {@link Object#hashCode()}.
 * <p>
 * An instance is a mutable accumulator: feed it elements through the
 * {@code add*} methods, then read the hash with {@link #get()} or compare two
 * accumulators with {@link #hammingDistance(EasyLSH, EasyLSH)}. The class
 * performs no synchronization of its own.
 */
public class EasyLSH {
    // This parameter determines the amount of shingling. Increasing this
    // increases the sensitivity to word ordering, but also magnifies
    // the impact of each difference overall.
    //
    // This must be a power of 2, lest things break: shingleHash() masks the
    // running index with (SHINGLING - 1) instead of taking a modulo.
    private static final int SHINGLING = 2;
    static { assert Integer.bitCount(SHINGLING) == 1; }

    /** One signed counter per output bit; the sign of each counter becomes a bit of the hash. */
    private final int[] fields = new int[64];

    /** Ring buffer holding the last {@code SHINGLING} hash codes passed to {@link #addHashOrdered(int)}. */
    private final int[] prevHashes = new int[SHINGLING];

    /** Monotonically increasing write position into {@link #prevHashes}; masked on use. */
    private int prevHashIdx = 0;

    /**
     * Adds an element without regard to its position in the collection.
     *
     * @param o the element; its {@code hashCode()} is what gets hashed
     * @throws NullPointerException if {@code o} is {@code null}
     */
    public void addUnordered(Object o) {
        addHashUnordered(o.hashCode());
    }

    /**
     * Adds an element in an order-sensitive way: the element is combined with
     * the elements most recently added through the ordered methods.
     *
     * @param o the element; its {@code hashCode()} is what gets hashed
     * @throws NullPointerException if {@code o} is {@code null}
     */
    public void addOrdered(Object o) {
        addHashOrdered(o.hashCode());
    }

    /**
     * Adds a raw hash code in an order-sensitive way.
     * <p>
     * The hash code is shingled (XORed) with the previous
     * {@code SHINGLING - 1} hash codes added through the ordered methods, and
     * the combined value is then added as an unordered hash.
     *
     * @param hashCode the hash code of the next element in the sequence
     */
    public void addHashOrdered(int hashCode) {
        // Shingle exactly once. Feeding the shingled value back through
        // shingleHash() would store it in the ring buffer and cancel the
        // current element out of the result (h ^ (h ^ prev) == prev).
        addHashUnordered(shingleHash(hashCode));
    }

    /**
     * Adds a raw hash code without regard to ordering.
     * <p>
     * Bit 1 of the hash code decides whether a counter is incremented or
     * decremented; higher bits select which of the 64 counters is touched.
     *
     * @param hashCode the hash code to fold into the counters
     */
    public void addHashUnordered(int hashCode) {
        // (hashCode & 2) is either 0 or 2, so this is +1 or -1
        int value = 1 - (hashCode & 2);

        // Fold four 6-bit slices of the hash code (starting at bits 2, 10, 18
        // and 26) into a single counter index in [0, 63]
        int field = 63 & ((hashCode >>> 2)
                ^ (hashCode >>> 10)
                ^ (hashCode >>> 18)
                ^ (hashCode >>> 26));

        fields[field] += value;
    }

    /**
     * Records {@code nextHash} in the ring buffer and returns the XOR of all
     * hash codes currently held in it.
     */
    private int shingleHash(int nextHash) {
        // The mask keeps the slot in range even after prevHashIdx wraps
        // around to negative values, since SHINGLING is a power of two.
        prevHashes[prevHashIdx++ & (SHINGLING - 1)] = nextHash;

        int ret = 0;
        for (int hashPart : prevHashes) {
            ret ^= hashPart;
        }
        return ret;
    }

    /**
     * Returns the 64 bit hash of everything added so far.
     * <p>
     * Each bit is set when the corresponding counter is negative. The first
     * counter maps to the most significant bit.
     *
     * @return the hash value
     */
    public long get() {
        long val = 0;

        for (int f : fields) {
            // f >>> 31 isolates the sign bit: 1 for negative counters, else 0
            val = (val << 1) | (f >>> 31);
        }

        return val;
    }

    /**
     * Returns the Hamming distance between this hash and {@code other}.
     *
     * @param other the hash to compare against
     * @return the number of differing bits, between 0 and 64
     */
    public int hammingDistance(EasyLSH other) {
        return hammingDistance(this, other);
    }

    /**
     * Returns the Hamming distance between two hash values as produced by
     * {@link #get()}.
     *
     * @param a the first hash value
     * @param b the second hash value
     * @return the number of bit positions in which {@code a} and {@code b} differ
     */
    public static int hammingDistance(long a, long b) {
        return Long.bitCount(a ^ b);
    }

    /**
     * Returns the Hamming distance between the hashes of two accumulators
     * without materializing the 64 bit values.
     *
     * @param a the first accumulator
     * @param b the second accumulator
     * @return the number of counters whose signs differ between {@code a} and {@code b}
     */
    public static int hammingDistance(EasyLSH a, EasyLSH b) {
        int distance = 0;

        for (int i = 0; i < a.fields.length; i++) {
            // The sign bit of the XOR is set exactly when the two signs differ
            distance += (a.fields[i] ^ b.fields[i]) >>> 31;
        }

        return distance;
    }
}
|
@ -0,0 +1,77 @@
|
||||
package nu.marginalia.util;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
class EasyLSHTest {
|
||||
|
||||
@Test
|
||||
public void testEZLSH() {
|
||||
String sA = """
|
||||
In computer science, locality-sensitive hashing (LSH) is an algorithmic technique that hashes similar input items
|
||||
into the same "buckets" with high probability.[1] (The number of buckets is much smaller than the universe of possible
|
||||
input items.)[1] Since similar items end up in the same buckets, this technique can be used for data clustering and
|
||||
nearest neighbor search. It differs from conventional hashing techniques in that hash collisions are maximized, not minimized.
|
||||
Alternatively, the technique can be seen as a way to reduce the dimensionality of high-dimensional data; high-dimensional input
|
||||
items can be reduced to low-dimensional versions while preserving relative distances between items.;
|
||||
""";
|
||||
|
||||
String sB = """
|
||||
In computer science, locality-sensitive hashing (LSH) is an algorithmic technique that hashes similar input items
|
||||
into the same "buckets" with high probability.[1] (The number of buckets is much smaller than the universe of possible
|
||||
input items.)[1] Since similar items end up in the same buckets, this technique can be used for data clustering and
|
||||
nearest neighbor search.
|
||||
|
||||
The wrath sing, goddess, of Peleus' son, Achilles, that destructive wrath which brought countless woes upon the Achaeans,
|
||||
and sent forth to Hades many valiant souls of heroes, and made them themselves spoil for dogs and every bird; thus the plan
|
||||
of Zeus came to fulfillment, [5] from the time when1 first they parted in strife Atreus' son, king of men, and brilliant Achilles.
|
||||
Who then of the gods was it that brought these two together to contend?
|
||||
|
||||
It differs from conventional hashing techniques in that hash collisions are maximized, not minimized.
|
||||
Alternatively, the technique can be seen as a way to reduce the dimensionality of high-dimensional data; high-dimensional input
|
||||
items can be reduced to low-dimensional versions while preserving relative distances between items.;
|
||||
""";
|
||||
|
||||
String sC = """
|
||||
The wrath sing, goddess, of Peleus' son, Achilles, that destructive wrath which brought countless woes upon the Achaeans,
|
||||
and sent forth to Hades many valiant souls of heroes, and made them themselves spoil for dogs and every bird; thus the plan
|
||||
of Zeus came to fulfillment, [5] from the time when1 first they parted in strife Atreus' son, king of men, and brilliant Achilles.
|
||||
Who then of the gods was it that brought these two together to contend?
|
||||
""";
|
||||
|
||||
String sD = """
|
||||
Quo usque tandem abutere, Catilina, patientia nostra? quam diu etiam furor iste tuus nos eludet? quem ad finem sese effrenata iactabit
|
||||
audacia? Nihilne te nocturnum praesidium Palati, nihil urbis vigiliae, nihil timor populi, nihil concursus bonorum omnium, nihil hic
|
||||
munitissimus habendi senatus locus, nihil horum ora voltusque moverunt? Patere tua consilia non sentis, constrictam iam horum omnium
|
||||
scientia teneri coniurationem tuam non vides? Quid proxima, quid superiore nocte egeris, ubi fueris, quos convocaveris, quid consilii
|
||||
ceperis, quem nostrum ignorare arbitraris? [2] O tempora, o mores! Senatus haec intellegit. Consul videt; hic tamen vivit. Vivit? immo
|
||||
vero etiam in senatum venit, fit publici consilii particeps, notat et designat oculis ad caedem unum quemque nostrum. Nos autem fortes
|
||||
viri satis facere rei publicae videmur, si istius furorem ac tela vitemus. Ad mortem te, Catilina, duci iussu consulis iam pridem oportebat,
|
||||
in te conferri pestem, quam tu in nos [omnes iam diu] machinaris.
|
||||
""";
|
||||
|
||||
EasyLSH hashA = new EasyLSH();
|
||||
Arrays.stream(sA.split("\\s")).forEach(hashA::addOrdered);
|
||||
EasyLSH hashB = new EasyLSH();
|
||||
Arrays.stream(sB.split("\\s")).forEach(hashB::addOrdered);
|
||||
EasyLSH hashC = new EasyLSH();
|
||||
Arrays.stream(sC.split("\\s")).forEach(hashC::addOrdered);
|
||||
|
||||
EasyLSH hashD = new EasyLSH();
|
||||
Arrays.stream(sD.split("\\s")).forEach(hashD::addOrdered);
|
||||
|
||||
System.out.println(Long.toBinaryString(hashA.get()));
|
||||
System.out.println(Long.toBinaryString(hashB.get()));
|
||||
System.out.println(Long.toBinaryString(hashC.get()));
|
||||
System.out.println(Long.toBinaryString(hashD.get()));
|
||||
|
||||
System.out.println(EasyLSH.hammingDistance(hashA, hashB));
|
||||
System.out.println(EasyLSH.hammingDistance(hashB, hashC));
|
||||
System.out.println(EasyLSH.hammingDistance(hashA, hashC));
|
||||
|
||||
System.out.println(EasyLSH.hammingDistance(hashA, hashD));
|
||||
System.out.println(EasyLSH.hammingDistance(hashB, hashD));
|
||||
System.out.println(EasyLSH.hammingDistance(hashC, hashD));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user