pub fn simhash(text: &str) -> u64 {
if text.is_empty() {
return 0;
}
let text_lower = text.to_lowercase();
let features = extract_features(&text_lower);
if features.is_empty() {
return hash_string(text);
}
let mut v = [0i32; 64];
for feature in features {
let hash = hash_string(&feature);
#[allow(clippy::needless_range_loop)]
for i in 0..64 {
let bit = (hash >> i) & 1;
if bit == 1 {
v[i] += 1;
} else {
v[i] -= 1;
}
}
}
let mut fingerprint: u64 = 0;
#[allow(clippy::needless_range_loop)]
for i in 0..64 {
if v[i] > 0 {
fingerprint |= 1u64 << i;
}
}
fingerprint
}
pub fn simhash_distance(hash1: u64, hash2: u64) -> u32 {
(hash1 ^ hash2).count_ones()
}
pub fn simhash_similarity(a: &str, b: &str) -> f64 {
let hash1 = simhash(a);
let hash2 = simhash(b);
let distance = simhash_distance(hash1, hash2);
1.0 - (distance as f64 / 64.0)
}
pub fn find_similar_hashes(
query_hash: u64,
candidate_hashes: &[u64],
max_distance: u32,
) -> Vec<usize> {
candidate_hashes
.iter()
.enumerate()
.filter_map(|(i, &hash)| {
let distance = simhash_distance(query_hash, hash);
if distance <= max_distance {
Some(i)
} else {
None
}
})
.collect()
}
fn extract_features(text: &str) -> Vec<String> {
let chars: Vec<char> = text.chars().collect();
let mut features = Vec::new();
if chars.len() <= 2 {
for c in chars {
features.push(c.to_string());
}
return features;
}
for window in chars.windows(3) {
let trigram: String = window.iter().collect();
features.push(trigram);
}
for token in text.split_whitespace() {
if !token.is_empty() {
features.push(token.to_string());
}
}
features
}
fn hash_string(s: &str) -> u64 {
const FNV_OFFSET_BASIS: u64 = 14695981039346656037;
const FNV_PRIME: u64 = 1099511628211;
let mut hash = FNV_OFFSET_BASIS;
for byte in s.as_bytes() {
hash ^= *byte as u64;
hash = hash.wrapping_mul(FNV_PRIME);
}
hash
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_simhash_identical() {
let hash1 = simhash("hello world");
let hash2 = simhash("hello world");
assert_eq!(hash1, hash2);
}
#[test]
fn test_simhash_similar() {
let hash1 = simhash("iPhone 14");
let hash2 = simhash("iPhone 15");
let hash3 = simhash("Galaxy S23");
let dist_similar = simhash_distance(hash1, hash2);
let dist_different = simhash_distance(hash1, hash3);
assert!(dist_similar < dist_different);
assert!(dist_similar < 20); }
#[test]
fn test_simhash_similarity() {
let sim_identical = simhash_similarity("test", "test");
assert_eq!(sim_identical, 1.0);
let sim_similar = simhash_similarity("hello world", "hello world!");
assert!(sim_similar > 0.7);
let sim_different = simhash_similarity("apple", "orange");
assert!(sim_different < 0.8); }
#[test]
fn test_simhash_distance() {
let hash1 = simhash("test");
let hash2 = simhash("test");
assert_eq!(simhash_distance(hash1, hash2), 0);
}
#[test]
fn test_find_similar_hashes() {
let candidates = vec!["iPhone 14 Pro", "iPhone 13", "Galaxy S23", "iPhone 14"];
let hashes: Vec<u64> = candidates.iter().map(|s| simhash(s)).collect();
let query_hash = simhash("iPhone 14");
let matches = find_similar_hashes(query_hash, &hashes, 15);
assert!(matches.len() >= 2);
assert!(matches.contains(&3)); }
#[test]
fn test_empty_string() {
let hash = simhash("");
assert_eq!(hash, 0);
}
#[test]
fn test_case_insensitive() {
let hash1 = simhash("Hello World");
let hash2 = simhash("hello world");
assert_eq!(hash1, hash2);
}
#[test]
fn test_feature_extraction() {
let features = extract_features("abc");
assert!(features.len() > 0);
let features = extract_features("hello world");
assert!(features.len() > 0);
}
}