use std::collections::HashSet;
use std::hash::{Hash, Hasher};
#[derive(Debug, Clone)]
pub struct MinHash {
num_hashes: usize,
seeds: Vec<u64>,
}
impl MinHash {
pub fn new(num_hashes: usize) -> Self {
Self::with_seed(num_hashes, 42)
}
pub fn with_seed(num_hashes: usize, seed: u64) -> Self {
let mut seeds = Vec::with_capacity(num_hashes);
let mut rng_state = seed;
for _ in 0..num_hashes {
rng_state = rng_state
.wrapping_mul(6364136223846793005)
.wrapping_add(1442695040888963407);
seeds.push(rng_state);
}
Self { num_hashes, seeds }
}
pub fn signature<T: Hash>(&self, items: &HashSet<T>) -> MinHashSignature {
let mut mins = vec![u64::MAX; self.num_hashes];
for item in items {
for (i, &seed) in self.seeds.iter().enumerate() {
let hash = self.hash_with_seed(item, seed);
if hash < mins[i] {
mins[i] = hash;
}
}
}
MinHashSignature { values: mins }
}
pub fn signature_from_iter<T: Hash, I: IntoIterator<Item = T>>(
&self,
items: I,
) -> MinHashSignature {
let mut mins = vec![u64::MAX; self.num_hashes];
for item in items {
for (i, &seed) in self.seeds.iter().enumerate() {
let hash = self.hash_with_seed(&item, seed);
if hash < mins[i] {
mins[i] = hash;
}
}
}
MinHashSignature { values: mins }
}
fn hash_with_seed<T: Hash>(&self, item: &T, seed: u64) -> u64 {
use std::collections::hash_map::DefaultHasher;
let mut hasher = DefaultHasher::new();
seed.hash(&mut hasher);
item.hash(&mut hasher);
hasher.finish()
}
pub fn num_hashes(&self) -> usize {
self.num_hashes
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MinHashSignature {
pub values: Vec<u64>,
}
impl MinHashSignature {
pub fn jaccard(&self, other: &MinHashSignature) -> f64 {
if self.values.len() != other.values.len() {
return 0.0;
}
let matches = self
.values
.iter()
.zip(other.values.iter())
.filter(|(a, b)| a == b)
.count();
matches as f64 / self.values.len() as f64
}
pub fn is_similar(&self, other: &MinHashSignature, threshold: f64) -> bool {
self.jaccard(other) >= threshold
}
pub fn hamming_distance(&self, other: &MinHashSignature) -> usize {
self.values
.iter()
.zip(other.values.iter())
.filter(|(a, b)| a != b)
.count()
}
pub fn merge(&self, other: &MinHashSignature) -> MinHashSignature {
let values = self
.values
.iter()
.zip(other.values.iter())
.map(|(&a, &b)| a.min(b))
.collect();
MinHashSignature { values }
}
pub fn len(&self) -> usize {
self.values.len()
}
pub fn is_empty(&self) -> bool {
self.values.is_empty()
}
}
#[derive(Debug)]
pub struct MinHashLSH {
bands: usize,
rows_per_band: usize,
buckets: Vec<std::collections::HashMap<u64, Vec<usize>>>,
signatures: Vec<MinHashSignature>,
}
impl MinHashLSH {
pub fn new(bands: usize, rows_per_band: usize) -> Self {
Self {
bands,
rows_per_band,
buckets: (0..bands)
.map(|_| std::collections::HashMap::new())
.collect(),
signatures: Vec::new(),
}
}
pub fn with_threshold(num_hashes: usize, threshold: f64) -> Self {
let mut best_bands = 1;
let mut best_error = f64::MAX;
for b in 1..=num_hashes {
if num_hashes % b == 0 {
let r = num_hashes / b;
let t = (1.0 / b as f64).powf(1.0 / r as f64);
let error = (t - threshold).abs();
if error < best_error {
best_error = error;
best_bands = b;
}
}
}
let rows = num_hashes / best_bands;
Self::new(best_bands, rows)
}
pub fn insert(&mut self, signature: MinHashSignature) -> usize {
let doc_id = self.signatures.len();
for (band_idx, chunk) in signature.values.chunks(self.rows_per_band).enumerate() {
if band_idx >= self.bands {
break;
}
let band_hash = self.hash_band(chunk);
self.buckets[band_idx]
.entry(band_hash)
.or_default()
.push(doc_id);
}
self.signatures.push(signature);
doc_id
}
pub fn query(&self, signature: &MinHashSignature) -> Vec<usize> {
let mut candidates: HashSet<usize> = HashSet::new();
for (band_idx, chunk) in signature.values.chunks(self.rows_per_band).enumerate() {
if band_idx >= self.bands {
break;
}
let band_hash = self.hash_band(chunk);
if let Some(docs) = self.buckets[band_idx].get(&band_hash) {
candidates.extend(docs.iter().copied());
}
}
candidates.into_iter().collect()
}
pub fn query_with_similarity(&self, signature: &MinHashSignature) -> Vec<(usize, f64)> {
let candidates = self.query(signature);
let mut results: Vec<(usize, f64)> = candidates
.into_iter()
.map(|id| {
let sim = signature.jaccard(&self.signatures[id]);
(id, sim)
})
.collect();
results.sort_by(|a, b| b.1.total_cmp(&a.1));
results
}
fn hash_band(&self, values: &[u64]) -> u64 {
use std::collections::hash_map::DefaultHasher;
let mut hasher = DefaultHasher::new();
for v in values {
v.hash(&mut hasher);
}
hasher.finish()
}
pub fn len(&self) -> usize {
self.signatures.len()
}
pub fn is_empty(&self) -> bool {
self.signatures.is_empty()
}
pub fn threshold(&self) -> f64 {
(1.0 / self.bands as f64).powf(1.0 / self.rows_per_band as f64)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_minhash_identical_sets() {
let mh = MinHash::new(128);
let set: HashSet<&str> = ["a", "b", "c"].into_iter().collect();
let sig1 = mh.signature(&set);
let sig2 = mh.signature(&set);
assert_eq!(sig1.jaccard(&sig2), 1.0);
}
#[test]
fn test_minhash_disjoint_sets() {
let mh = MinHash::new(128);
let set1: HashSet<&str> = ["a", "b", "c"].into_iter().collect();
let set2: HashSet<&str> = ["x", "y", "z"].into_iter().collect();
let sig1 = mh.signature(&set1);
let sig2 = mh.signature(&set2);
assert!(sig1.jaccard(&sig2) < 0.2);
}
#[test]
fn test_minhash_similar_sets() {
let mh = MinHash::new(256);
let set1: HashSet<i32> = (0..100).collect();
let set2: HashSet<i32> = (50..150).collect();
let sig1 = mh.signature(&set1);
let sig2 = mh.signature(&set2);
let estimated = sig1.jaccard(&sig2);
assert!((estimated - 0.333).abs() < 0.1);
}
#[test]
fn test_minhash_lsh() {
let mh = MinHash::new(100);
let mut lsh = MinHashLSH::new(20, 5);
let doc1: HashSet<&str> = ["the", "quick", "brown", "fox"].into_iter().collect();
let doc2: HashSet<&str> = ["the", "quick", "brown", "dog"].into_iter().collect();
let doc3: HashSet<&str> = ["hello", "world", "foo", "bar"].into_iter().collect();
let sig1 = mh.signature(&doc1);
let sig2 = mh.signature(&doc2);
let sig3 = mh.signature(&doc3);
lsh.insert(sig1);
lsh.insert(sig2);
lsh.insert(sig3.clone());
let results = lsh.query_with_similarity(&mh.signature(&doc2));
assert!(!results.is_empty());
}
#[test]
fn test_signature_merge() {
let mh = MinHash::new(64);
let set1: HashSet<&str> = ["a", "b"].into_iter().collect();
let set2: HashSet<&str> = ["c", "d"].into_iter().collect();
let union: HashSet<&str> = ["a", "b", "c", "d"].into_iter().collect();
let sig1 = mh.signature(&set1);
let sig2 = mh.signature(&set2);
let sig_union = mh.signature(&union);
let sig_merged = sig1.merge(&sig2);
assert_eq!(sig_merged, sig_union);
}
}