1use std::collections::hash_map::DefaultHasher;
2use std::hash::{Hash, Hasher};
3
4const SIMHASH_BITS: usize = 64;
5
6#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
7pub struct Simhash(u64);
8
9impl Simhash {
10 pub fn new(value: u64) -> Self {
11 Self(value)
12 }
13
14 pub fn from_text(text: &str) -> Self {
15 let tokens = tokenize(text);
16 Self::from_tokens(&tokens)
17 }
18
19 pub fn from_tokens(tokens: &[String]) -> Self {
20 let mut v = vec![0i32; SIMHASH_BITS];
21
22 for token in tokens {
23 let hash = hash_token(token);
24 for (i, count) in v.iter_mut().enumerate().take(SIMHASH_BITS) {
25 if (hash >> i) & 1 == 1 {
26 *count += 1;
27 } else {
28 *count -= 1;
29 }
30 }
31 }
32
33 let mut fingerprint: u64 = 0;
34 for (i, &count) in v.iter().enumerate().take(SIMHASH_BITS) {
35 if count > 0 {
36 fingerprint |= 1 << i;
37 }
38 }
39
40 Self(fingerprint)
41 }
42
43 pub fn hamming_distance(&self, other: &Simhash) -> u32 {
44 (self.0 ^ other.0).count_ones()
45 }
46
47 pub fn similarity(&self, other: &Simhash) -> f64 {
48 let distance = self.hamming_distance(other);
49 1.0 - (distance as f64 / SIMHASH_BITS as f64)
50 }
51
52 pub fn is_near_duplicate(&self, other: &Simhash, threshold: u32) -> bool {
53 self.hamming_distance(other) <= threshold
54 }
55
56 pub fn value(&self) -> u64 {
57 self.0
58 }
59}
60
61fn tokenize(text: &str) -> Vec<String> {
62 text.split_whitespace()
63 .filter(|s| s.len() >= 3)
64 .map(|s| s.to_lowercase())
65 .collect()
66}
67
68fn hash_token(token: &str) -> u64 {
69 let mut hasher = DefaultHasher::new();
70 token.hash(&mut hasher);
71 hasher.finish()
72}
73
74#[cfg(test)]
75mod tests {
76 use super::*;
77
78 #[test]
79 fn identical_texts_have_zero_distance() {
80 let text = "The quick brown fox jumps over the lazy dog";
81 let hash1 = Simhash::from_text(text);
82 let hash2 = Simhash::from_text(text);
83 assert_eq!(hash1.hamming_distance(&hash2), 0);
84 assert_eq!(hash1.similarity(&hash2), 1.0);
85 }
86
87 #[test]
88 fn similar_texts_have_small_distance() {
89 let text1 = "The quick brown fox jumps over the lazy dog";
90 let text2 = "The quick brown fox jumps over a lazy dog";
91 let hash1 = Simhash::from_text(text1);
92 let hash2 = Simhash::from_text(text2);
93 let distance = hash1.hamming_distance(&hash2);
94 assert!(distance < 10, "Distance should be small: {}", distance);
95 assert!(hash1.similarity(&hash2) > 0.8);
96 }
97
98 #[test]
99 fn different_texts_have_large_distance() {
100 let text1 = "The quick brown fox jumps over the lazy dog";
101 let text2 = "Python is a programming language";
102 let hash1 = Simhash::from_text(text1);
103 let hash2 = Simhash::from_text(text2);
104 let distance = hash1.hamming_distance(&hash2);
105 assert!(distance > 20, "Distance should be large: {}", distance);
106 assert!(hash1.similarity(&hash2) < 0.7);
107 }
108
109 #[test]
110 fn near_duplicate_detection() {
111 let text1 = "This is a test document with some content";
112 let text2 = "This is a test document with similar content";
113 let text3 = "Completely different text about something else";
114
115 let hash1 = Simhash::from_text(text1);
116 let hash2 = Simhash::from_text(text2);
117 let hash3 = Simhash::from_text(text3);
118
119 let distance_similar = hash1.hamming_distance(&hash2);
120 let distance_different = hash1.hamming_distance(&hash3);
121
122 assert!(
123 distance_similar < distance_different,
124 "Similar texts should have smaller distance: {} vs {}",
125 distance_similar,
126 distance_different
127 );
128 assert!(hash1.is_near_duplicate(&hash2, 15));
129 assert!(!hash1.is_near_duplicate(&hash3, 15));
130 }
131
132 #[test]
133 fn empty_text_handling() {
134 let hash1 = Simhash::from_text("");
135 let hash2 = Simhash::from_text("");
136 assert_eq!(hash1.hamming_distance(&hash2), 0);
137 }
138
139 #[test]
140 fn case_insensitive() {
141 let text1 = "The Quick Brown Fox";
142 let text2 = "the quick brown fox";
143 let hash1 = Simhash::from_text(text1);
144 let hash2 = Simhash::from_text(text2);
145 assert_eq!(hash1.hamming_distance(&hash2), 0);
146 }
147}