2024-03-19 13:28:42 +00:00
|
|
|
package nu.marginalia.segmentation;
|
2024-03-12 12:12:50 +00:00
|
|
|
|
|
|
|
import nu.marginalia.hash.MurmurHash3_128;
|
|
|
|
|
|
|
|
/** A group of hash functions that can be used to hash a sequence of strings,
|
|
|
|
* that also has an inverse operation that can be used to remove a previously applied
|
|
|
|
* string from the sequence. */
|
2024-03-19 13:28:42 +00:00
|
|
|
public sealed interface HasherGroup {
|
2024-03-12 12:12:50 +00:00
|
|
|
/** Apply a hash to the accumulator */
|
|
|
|
long apply(long acc, long add);
|
|
|
|
|
|
|
|
/** Remove a hash that was added n operations ago from the accumulator, add a new one */
|
|
|
|
long replace(long acc, long add, long rem, int n);
|
|
|
|
|
|
|
|
/** Create a new hasher group that preserves the order of appleid hash functions */
|
|
|
|
static HasherGroup ordered() {
|
|
|
|
return new OrderedHasher();
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Create a new hasher group that does not preserve the order of applied hash functions */
|
|
|
|
static HasherGroup unordered() {
|
|
|
|
return new UnorderedHasher();
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Bake the words in the sentence into a hash successively using the group's apply function */
|
|
|
|
default long rollingHash(String[] parts) {
|
|
|
|
long code = 0;
|
|
|
|
for (String part : parts) {
|
|
|
|
code = apply(code, hash(part));
|
|
|
|
}
|
|
|
|
return code;
|
|
|
|
}
|
|
|
|
|
|
|
|
MurmurHash3_128 hash = new MurmurHash3_128();
|
|
|
|
/** Calculate the hash of a string */
|
|
|
|
static long hash(String term) {
|
|
|
|
return hash.hashNearlyASCII(term);
|
|
|
|
}
|
|
|
|
|
|
|
|
final class UnorderedHasher implements HasherGroup {
|
|
|
|
|
|
|
|
public long apply(long acc, long add) {
|
|
|
|
return acc ^ add;
|
|
|
|
}
|
|
|
|
|
|
|
|
public long replace(long acc, long add, long rem, int n) {
|
|
|
|
return acc ^ rem ^ add;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
final class OrderedHasher implements HasherGroup {
|
|
|
|
|
|
|
|
public long apply(long acc, long add) {
|
|
|
|
return Long.rotateLeft(acc, 1) ^ add;
|
|
|
|
}
|
|
|
|
|
|
|
|
public long replace(long acc, long add, long rem, int n) {
|
|
|
|
return Long.rotateLeft(acc, 1) ^ add ^ Long.rotateLeft(rem, n);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|