mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Simple implementation of a locality-sensitive hash for text words.
This commit is contained in:
parent
ff30de7352
commit
b5805063e0
94
marginalia_nu/src/main/java/nu/marginalia/util/EasyLSH.java
Normal file
94
marginalia_nu/src/main/java/nu/marginalia/util/EasyLSH.java
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
package nu.marginalia.util;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/** This is a very simple locality sensitive hash for collections of Java objects.
|
||||||
|
* <p>
|
||||||
|
* The resulting LSH is a 64 bit value, whose hamming distance is a measure
|
||||||
|
* of the similarity of the two collections, where smaller similarities imply
|
||||||
|
* similarity.
|
||||||
|
* <p>
|
||||||
|
* It hinges on a lot of relatively sketchy assumptions about Object$hashCode().
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class EasyLSH {
|
||||||
|
// This parameter determines the amount of shingling. Increasing this
|
||||||
|
// increases the sensitivity to word ordering, but also magnifies
|
||||||
|
// the impact of each difference overall.
|
||||||
|
//
|
||||||
|
// This must be a power of 2, lest things break
|
||||||
|
private static final int SHINGLING = 2;
|
||||||
|
static { assert Integer.bitCount(SHINGLING) == 1; }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private final int[] fields = new int[64];
|
||||||
|
private final int[] prevHashes = new int[SHINGLING];
|
||||||
|
private int prevHashIdx = 0;
|
||||||
|
|
||||||
|
public void addUnordered(Object o) {
|
||||||
|
addHashUnordered(o.hashCode());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addOrdered(Object o) {
|
||||||
|
addHashOrdered(o.hashCode());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addHashOrdered(int hashCode) {
|
||||||
|
hashCode = shingleHash(hashCode);
|
||||||
|
addHashUnordered(shingleHash(hashCode));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addHashUnordered(int hashCode) {
|
||||||
|
int value = 1-(hashCode & 2);
|
||||||
|
|
||||||
|
// Try to extract all the remaining entropy
|
||||||
|
// into selecting the field to update
|
||||||
|
|
||||||
|
int field = 63 & (((hashCode >>> 2)
|
||||||
|
^ (hashCode >>> 10)
|
||||||
|
^ (hashCode >>> 18)
|
||||||
|
^ (hashCode >>> 26)));
|
||||||
|
|
||||||
|
fields[field] += value;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int shingleHash(int nextHash) {
|
||||||
|
prevHashes[prevHashIdx++ & (SHINGLING-1)] = nextHash;
|
||||||
|
int ret = 0;
|
||||||
|
for (int hashPart : prevHashes) {
|
||||||
|
ret ^= hashPart;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long get() {
|
||||||
|
long val = 0;
|
||||||
|
|
||||||
|
for (int f : fields) {
|
||||||
|
val = (val << 1) | (f >>> 31);
|
||||||
|
}
|
||||||
|
|
||||||
|
return val;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hammingDistance(EasyLSH other) {
|
||||||
|
return hammingDistance(this, other);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static int hammingDistance(long a, long b) {
|
||||||
|
return Long.bitCount(a^b);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static int hammingDistance(EasyLSH a, EasyLSH b) {
|
||||||
|
int distance = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < a.fields.length; i++) {
|
||||||
|
distance += (a.fields[i] ^ b.fields[i]) >>> 31;
|
||||||
|
}
|
||||||
|
|
||||||
|
return distance;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,77 @@
|
|||||||
|
package nu.marginalia.util;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;

import java.util.Arrays;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
class EasyLSHTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testEZLSH() {
|
||||||
|
String sA = """
|
||||||
|
In computer science, locality-sensitive hashing (LSH) is an algorithmic technique that hashes similar input items
|
||||||
|
into the same "buckets" with high probability.[1] (The number of buckets is much smaller than the universe of possible
|
||||||
|
input items.)[1] Since similar items end up in the same buckets, this technique can be used for data clustering and
|
||||||
|
nearest neighbor search. It differs from conventional hashing techniques in that hash collisions are maximized, not minimized.
|
||||||
|
Alternatively, the technique can be seen as a way to reduce the dimensionality of high-dimensional data; high-dimensional input
|
||||||
|
items can be reduced to low-dimensional versions while preserving relative distances between items.;
|
||||||
|
""";
|
||||||
|
|
||||||
|
String sB = """
|
||||||
|
In computer science, locality-sensitive hashing (LSH) is an algorithmic technique that hashes similar input items
|
||||||
|
into the same "buckets" with high probability.[1] (The number of buckets is much smaller than the universe of possible
|
||||||
|
input items.)[1] Since similar items end up in the same buckets, this technique can be used for data clustering and
|
||||||
|
nearest neighbor search.
|
||||||
|
|
||||||
|
The wrath sing, goddess, of Peleus' son, Achilles, that destructive wrath which brought countless woes upon the Achaeans,
|
||||||
|
and sent forth to Hades many valiant souls of heroes, and made them themselves spoil for dogs and every bird; thus the plan
|
||||||
|
of Zeus came to fulfillment, [5] from the time when1 first they parted in strife Atreus' son, king of men, and brilliant Achilles.
|
||||||
|
Who then of the gods was it that brought these two together to contend?
|
||||||
|
|
||||||
|
It differs from conventional hashing techniques in that hash collisions are maximized, not minimized.
|
||||||
|
Alternatively, the technique can be seen as a way to reduce the dimensionality of high-dimensional data; high-dimensional input
|
||||||
|
items can be reduced to low-dimensional versions while preserving relative distances between items.;
|
||||||
|
""";
|
||||||
|
|
||||||
|
String sC = """
|
||||||
|
The wrath sing, goddess, of Peleus' son, Achilles, that destructive wrath which brought countless woes upon the Achaeans,
|
||||||
|
and sent forth to Hades many valiant souls of heroes, and made them themselves spoil for dogs and every bird; thus the plan
|
||||||
|
of Zeus came to fulfillment, [5] from the time when1 first they parted in strife Atreus' son, king of men, and brilliant Achilles.
|
||||||
|
Who then of the gods was it that brought these two together to contend?
|
||||||
|
""";
|
||||||
|
|
||||||
|
String sD = """
|
||||||
|
Quo usque tandem abutere, Catilina, patientia nostra? quam diu etiam furor iste tuus nos eludet? quem ad finem sese effrenata iactabit
|
||||||
|
audacia? Nihilne te nocturnum praesidium Palati, nihil urbis vigiliae, nihil timor populi, nihil concursus bonorum omnium, nihil hic
|
||||||
|
munitissimus habendi senatus locus, nihil horum ora voltusque moverunt? Patere tua consilia non sentis, constrictam iam horum omnium
|
||||||
|
scientia teneri coniurationem tuam non vides? Quid proxima, quid superiore nocte egeris, ubi fueris, quos convocaveris, quid consilii
|
||||||
|
ceperis, quem nostrum ignorare arbitraris? [2] O tempora, o mores! Senatus haec intellegit. Consul videt; hic tamen vivit. Vivit? immo
|
||||||
|
vero etiam in senatum venit, fit publici consilii particeps, notat et designat oculis ad caedem unum quemque nostrum. Nos autem fortes
|
||||||
|
viri satis facere rei publicae videmur, si istius furorem ac tela vitemus. Ad mortem te, Catilina, duci iussu consulis iam pridem oportebat,
|
||||||
|
in te conferri pestem, quam tu in nos [omnes iam diu] machinaris.
|
||||||
|
""";
|
||||||
|
|
||||||
|
EasyLSH hashA = new EasyLSH();
|
||||||
|
Arrays.stream(sA.split("\\s")).forEach(hashA::addOrdered);
|
||||||
|
EasyLSH hashB = new EasyLSH();
|
||||||
|
Arrays.stream(sB.split("\\s")).forEach(hashB::addOrdered);
|
||||||
|
EasyLSH hashC = new EasyLSH();
|
||||||
|
Arrays.stream(sC.split("\\s")).forEach(hashC::addOrdered);
|
||||||
|
|
||||||
|
EasyLSH hashD = new EasyLSH();
|
||||||
|
Arrays.stream(sD.split("\\s")).forEach(hashD::addOrdered);
|
||||||
|
|
||||||
|
System.out.println(Long.toBinaryString(hashA.get()));
|
||||||
|
System.out.println(Long.toBinaryString(hashB.get()));
|
||||||
|
System.out.println(Long.toBinaryString(hashC.get()));
|
||||||
|
System.out.println(Long.toBinaryString(hashD.get()));
|
||||||
|
|
||||||
|
System.out.println(EasyLSH.hammingDistance(hashA, hashB));
|
||||||
|
System.out.println(EasyLSH.hammingDistance(hashB, hashC));
|
||||||
|
System.out.println(EasyLSH.hammingDistance(hashA, hashC));
|
||||||
|
|
||||||
|
System.out.println(EasyLSH.hammingDistance(hashA, hashD));
|
||||||
|
System.out.println(EasyLSH.hammingDistance(hashB, hashD));
|
||||||
|
System.out.println(EasyLSH.hammingDistance(hashC, hashD));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user