Skip to main content

argus_dedupe/
simhash.rs

1use std::collections::hash_map::DefaultHasher;
2use std::hash::{Hash, Hasher};
3
4const SIMHASH_BITS: usize = 64;
5
6#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
7pub struct Simhash(u64);
8
9impl Simhash {
10    pub fn new(value: u64) -> Self {
11        Self(value)
12    }
13
14    pub fn from_text(text: &str) -> Self {
15        let tokens = tokenize(text);
16        Self::from_tokens(&tokens)
17    }
18
19    pub fn from_tokens(tokens: &[String]) -> Self {
20        let mut v = vec![0i32; SIMHASH_BITS];
21
22        for token in tokens {
23            let hash = hash_token(token);
24            for (i, count) in v.iter_mut().enumerate().take(SIMHASH_BITS) {
25                if (hash >> i) & 1 == 1 {
26                    *count += 1;
27                } else {
28                    *count -= 1;
29                }
30            }
31        }
32
33        let mut fingerprint: u64 = 0;
34        for (i, &count) in v.iter().enumerate().take(SIMHASH_BITS) {
35            if count > 0 {
36                fingerprint |= 1 << i;
37            }
38        }
39
40        Self(fingerprint)
41    }
42
43    pub fn hamming_distance(&self, other: &Simhash) -> u32 {
44        (self.0 ^ other.0).count_ones()
45    }
46
47    pub fn similarity(&self, other: &Simhash) -> f64 {
48        let distance = self.hamming_distance(other);
49        1.0 - (distance as f64 / SIMHASH_BITS as f64)
50    }
51
52    pub fn is_near_duplicate(&self, other: &Simhash, threshold: u32) -> bool {
53        self.hamming_distance(other) <= threshold
54    }
55
56    pub fn value(&self) -> u64 {
57        self.0
58    }
59}
60
61fn tokenize(text: &str) -> Vec<String> {
62    text.split_whitespace()
63        .filter(|s| s.len() >= 3)
64        .map(|s| s.to_lowercase())
65        .collect()
66}
67
68fn hash_token(token: &str) -> u64 {
69    let mut hasher = DefaultHasher::new();
70    token.hash(&mut hasher);
71    hasher.finish()
72}
73
74#[cfg(test)]
75mod tests {
76    use super::*;
77
78    #[test]
79    fn identical_texts_have_zero_distance() {
80        let text = "The quick brown fox jumps over the lazy dog";
81        let hash1 = Simhash::from_text(text);
82        let hash2 = Simhash::from_text(text);
83        assert_eq!(hash1.hamming_distance(&hash2), 0);
84        assert_eq!(hash1.similarity(&hash2), 1.0);
85    }
86
87    #[test]
88    fn similar_texts_have_small_distance() {
89        let text1 = "The quick brown fox jumps over the lazy dog";
90        let text2 = "The quick brown fox jumps over a lazy dog";
91        let hash1 = Simhash::from_text(text1);
92        let hash2 = Simhash::from_text(text2);
93        let distance = hash1.hamming_distance(&hash2);
94        assert!(distance < 10, "Distance should be small: {}", distance);
95        assert!(hash1.similarity(&hash2) > 0.8);
96    }
97
98    #[test]
99    fn different_texts_have_large_distance() {
100        let text1 = "The quick brown fox jumps over the lazy dog";
101        let text2 = "Python is a programming language";
102        let hash1 = Simhash::from_text(text1);
103        let hash2 = Simhash::from_text(text2);
104        let distance = hash1.hamming_distance(&hash2);
105        assert!(distance > 20, "Distance should be large: {}", distance);
106        assert!(hash1.similarity(&hash2) < 0.7);
107    }
108
109    #[test]
110    fn near_duplicate_detection() {
111        let text1 = "This is a test document with some content";
112        let text2 = "This is a test document with similar content";
113        let text3 = "Completely different text about something else";
114
115        let hash1 = Simhash::from_text(text1);
116        let hash2 = Simhash::from_text(text2);
117        let hash3 = Simhash::from_text(text3);
118
119        let distance_similar = hash1.hamming_distance(&hash2);
120        let distance_different = hash1.hamming_distance(&hash3);
121
122        assert!(
123            distance_similar < distance_different,
124            "Similar texts should have smaller distance: {} vs {}",
125            distance_similar,
126            distance_different
127        );
128        assert!(hash1.is_near_duplicate(&hash2, 15));
129        assert!(!hash1.is_near_duplicate(&hash3, 15));
130    }
131
132    #[test]
133    fn empty_text_handling() {
134        let hash1 = Simhash::from_text("");
135        let hash2 = Simhash::from_text("");
136        assert_eq!(hash1.hamming_distance(&hash2), 0);
137    }
138
139    #[test]
140    fn case_insensitive() {
141        let text1 = "The Quick Brown Fox";
142        let text2 = "the quick brown fox";
143        let hash1 = Simhash::from_text(text1);
144        let hash2 = Simhash::from_text(text2);
145        assert_eq!(hash1.hamming_distance(&hash2), 0);
146    }
147}