Skip to main content

iscc_lib/
simhash.rs

1//! SimHash algorithm and sliding window utilities.
2//!
3//! Implements the SimHash similarity-preserving hash and sliding window
4//! n-gram generation, ported from `iscc-core` `simhash.py` and `utils.py`.
5
6use crate::{IsccError, IsccResult};
7use std::cmp;
8
9/// Compute a SimHash from a sequence of equal-length hash digests.
10///
11/// Validates that all digests have equal length, then delegates to
12/// `alg_simhash_inner`. Returns `Err(IsccError::InvalidInput)` if
13/// digest lengths are mismatched.
14pub fn alg_simhash(hash_digests: &[impl AsRef<[u8]>]) -> IsccResult<Vec<u8>> {
15    if hash_digests.len() >= 2 {
16        let expected_len = hash_digests[0].as_ref().len();
17        for (i, digest) in hash_digests.iter().enumerate().skip(1) {
18            if digest.as_ref().len() != expected_len {
19                return Err(IsccError::InvalidInput(format!(
20                    "All hash digests must have equal length (expected {}, got {} at index {})",
21                    expected_len,
22                    digest.as_ref().len(),
23                    i
24                )));
25            }
26        }
27    }
28    Ok(alg_simhash_inner(hash_digests))
29}
30
31/// Compute a SimHash from a sequence of equal-length hash digests (unchecked).
32///
33/// Internal variant that skips length validation. Callers must guarantee
34/// all digests have equal length. Returns 32 zero bytes for empty input.
35pub(crate) fn alg_simhash_inner(hash_digests: &[impl AsRef<[u8]>]) -> Vec<u8> {
36    if hash_digests.is_empty() {
37        return vec![0u8; 32];
38    }
39
40    let n_bytes = hash_digests[0].as_ref().len();
41    let n_bits = n_bytes * 8;
42    let mut vector = vec![0u32; n_bits];
43
44    for digest in hash_digests {
45        let bytes = digest.as_ref();
46        for (i, count) in vector.iter_mut().enumerate() {
47            let byte_idx = i / 8;
48            let bit_idx = 7 - (i % 8); // MSB first
49            if (bytes[byte_idx] >> bit_idx) & 1 == 1 {
50                *count += 1;
51            }
52        }
53    }
54
55    // Threshold: count * 2 >= n matches Python's count >= n / 2.0
56    let n = hash_digests.len() as u32;
57    let mut result = vec![0u8; n_bytes];
58    for (i, &count) in vector.iter().enumerate() {
59        if count * 2 >= n {
60            let byte_idx = i / 8;
61            let bit_idx = 7 - (i % 8);
62            result[byte_idx] |= 1 << bit_idx;
63        }
64    }
65
66    result
67}
68
69/// Generate sliding window n-grams from a string.
70///
71/// Returns overlapping substrings of `width` Unicode characters, advancing
72/// by one character at a time. If the input is shorter than `width`, returns
73/// a single element containing the full input.
74///
75/// Returns `Err(IsccError::InvalidInput)` if `width < 2`.
76pub fn sliding_window(seq: &str, width: usize) -> IsccResult<Vec<String>> {
77    if width < 2 {
78        return Err(IsccError::InvalidInput(
79            "Sliding window width must be 2 or bigger.".into(),
80        ));
81    }
82    let chars: Vec<char> = seq.chars().collect();
83    let len = chars.len();
84    let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
85    Ok((0..range)
86        .map(|i| {
87            let end = cmp::min(i + width, len);
88            chars[i..end].iter().collect()
89        })
90        .collect())
91}
92
93/// Generate sliding window n-grams as borrowed string slices.
94///
95/// Returns overlapping substrings of `width` Unicode characters as `&str`
96/// slices borrowing from the input, avoiding per-n-gram `String` allocation.
97/// If the input is shorter than `width`, returns a single slice of the full input.
98pub(crate) fn sliding_window_strs(seq: &str, width: usize) -> Vec<&str> {
99    debug_assert!(width >= 2, "Sliding window width must be 2 or bigger.");
100    let char_indices: Vec<usize> = seq.char_indices().map(|(i, _)| i).collect();
101    let len = char_indices.len();
102    if len == 0 {
103        return vec![seq]; // empty string → single empty slice
104    }
105    let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
106    (0..range)
107        .map(|i| {
108            let start = char_indices[i];
109            let end = if i + width >= len {
110                seq.len()
111            } else {
112                char_indices[i + width]
113            };
114            &seq[start..end]
115        })
116        .collect()
117}
118
119/// Generate sliding window n-grams from a byte slice.
120///
121/// Returns overlapping sub-slices of `width` bytes, advancing by one byte
122/// at a time. If the input is shorter than `width`, returns a single slice
123/// of the full input.
124pub(crate) fn sliding_window_bytes(data: &[u8], width: usize) -> Vec<&[u8]> {
125    debug_assert!(width >= 2, "Sliding window width must be 2 or bigger.");
126    let len = data.len();
127    let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
128    (0..range)
129        .map(|i| {
130            let end = cmp::min(i + width, len);
131            &data[i..end]
132        })
133        .collect()
134}
135
136#[cfg(test)]
137mod tests {
138    use super::*;
139
140    #[test]
141    fn test_sliding_window_basic() {
142        assert_eq!(
143            sliding_window("Hello", 4).unwrap(),
144            vec!["Hell".to_string(), "ello".to_string()]
145        );
146    }
147
148    #[test]
149    fn test_sliding_window_shorter_than_width() {
150        assert_eq!(sliding_window("ab", 3).unwrap(), vec!["ab".to_string()]);
151    }
152
153    #[test]
154    fn test_sliding_window_exact_width() {
155        assert_eq!(sliding_window("abc", 3).unwrap(), vec!["abc".to_string()]);
156    }
157
158    #[test]
159    fn test_sliding_window_empty() {
160        assert_eq!(sliding_window("", 3).unwrap(), vec!["".to_string()]);
161    }
162
163    #[test]
164    fn test_sliding_window_unicode() {
165        assert_eq!(
166            sliding_window("äöü", 2).unwrap(),
167            vec!["äö".to_string(), "öü".to_string()]
168        );
169    }
170
171    #[test]
172    fn test_alg_simhash_single_digest() {
173        let digest = blake3::hash(b"test");
174        let result = alg_simhash(&[digest.as_bytes().to_vec()]).unwrap();
175        assert_eq!(result, digest.as_bytes().to_vec());
176    }
177
178    #[test]
179    fn test_alg_simhash_empty() {
180        let empty: Vec<Vec<u8>> = vec![];
181        let result = alg_simhash(&empty).unwrap();
182        assert_eq!(result, vec![0u8; 32]);
183    }
184
185    #[test]
186    fn test_alg_simhash_identical_digests() {
187        let digest = vec![0xFFu8; 32];
188        let result = alg_simhash(&[digest.clone(), digest.clone(), digest]).unwrap();
189        assert_eq!(result, vec![0xFFu8; 32]);
190    }
191
192    #[test]
193    fn test_alg_simhash_opposite_digests() {
194        // Two digests: all-ones and all-zeros. Threshold is count/2 = 1.
195        // Each bit has count 1 (from all-ones) or 0 (from all-zeros).
196        // vector[i] * 2 >= 2 → vector[i] >= 1. Bits from all-ones survive.
197        let ones = vec![0xFFu8; 32];
198        let zeros = vec![0x00u8; 32];
199        let result = alg_simhash(&[ones, zeros]).unwrap();
200        assert_eq!(result, vec![0xFFu8; 32]);
201    }
202
203    #[test]
204    fn test_alg_simhash_mismatched_lengths_returns_error() {
205        // Mismatched digest lengths should return Err, not panic
206        let result = alg_simhash(&[vec![1u8, 2], vec![1u8, 2, 3]]);
207        assert!(result.is_err());
208        let msg = result.unwrap_err().to_string();
209        assert!(
210            msg.contains("equal length"),
211            "error message should mention equal length, got: {msg}"
212        );
213    }
214
215    #[test]
216    fn test_sliding_window_width_too_small() {
217        let result = sliding_window("test", 1);
218        assert!(result.is_err());
219        let msg = result.unwrap_err().to_string();
220        assert!(
221            msg.contains("width must be 2"),
222            "error message should mention width constraint, got: {msg}"
223        );
224    }
225
226    // ---- sliding_window_strs tests ----
227
228    #[test]
229    fn test_sliding_window_strs_basic() {
230        assert_eq!(sliding_window_strs("Hello", 4), vec!["Hell", "ello"]);
231    }
232
233    #[test]
234    fn test_sliding_window_strs_shorter_than_width() {
235        assert_eq!(sliding_window_strs("ab", 3), vec!["ab"]);
236    }
237
238    #[test]
239    fn test_sliding_window_strs_exact_width() {
240        assert_eq!(sliding_window_strs("abc", 3), vec!["abc"]);
241    }
242
243    #[test]
244    fn test_sliding_window_strs_empty() {
245        assert_eq!(sliding_window_strs("", 3), vec![""]);
246    }
247
248    #[test]
249    fn test_sliding_window_strs_unicode() {
250        assert_eq!(sliding_window_strs("äöü", 2), vec!["äö", "öü"]);
251    }
252
253    #[cfg(debug_assertions)]
254    #[test]
255    #[should_panic(expected = "width must be 2")]
256    fn test_sliding_window_strs_width_too_small() {
257        sliding_window_strs("test", 1);
258    }
259
260    #[test]
261    fn test_sliding_window_strs_matches_sliding_window() {
262        // Verify that sliding_window_strs produces identical content to sliding_window
263        let cases = vec![
264            ("Hello World", 4),
265            ("ab", 3),
266            ("abc", 3),
267            ("", 3),
268            ("äöü", 2),
269            ("Hello", 13),
270            ("a longer test string for n-grams", 5),
271        ];
272        for (input, width) in cases {
273            let strings = sliding_window(input, width).unwrap();
274            let strs = sliding_window_strs(input, width);
275            assert_eq!(
276                strings.len(),
277                strs.len(),
278                "length mismatch for input={input:?} width={width}"
279            );
280            for (s, sr) in strings.iter().zip(strs.iter()) {
281                assert_eq!(
282                    s.as_str(),
283                    *sr,
284                    "content mismatch for input={input:?} width={width}"
285                );
286            }
287        }
288    }
289
290    // ---- sliding_window_bytes tests ----
291
292    #[test]
293    fn test_sliding_window_bytes_basic() {
294        assert_eq!(
295            sliding_window_bytes(b"Hello", 4),
296            vec![&b"Hell"[..], &b"ello"[..]]
297        );
298    }
299
300    #[test]
301    fn test_sliding_window_bytes_shorter_than_width() {
302        assert_eq!(sliding_window_bytes(b"ab", 3), vec![&b"ab"[..]]);
303    }
304
305    #[test]
306    fn test_sliding_window_bytes_exact_width() {
307        assert_eq!(sliding_window_bytes(b"abc", 3), vec![&b"abc"[..]]);
308    }
309
310    #[test]
311    fn test_sliding_window_bytes_empty() {
312        assert_eq!(sliding_window_bytes(b"", 3), vec![&b""[..]]);
313    }
314
315    #[test]
316    fn test_sliding_window_bytes_width_4() {
317        assert_eq!(
318            sliding_window_bytes(b"abcdef", 4),
319            vec![&b"abcd"[..], &b"bcde"[..], &b"cdef"[..]]
320        );
321    }
322
323    #[cfg(debug_assertions)]
324    #[test]
325    #[should_panic(expected = "width must be 2")]
326    fn test_sliding_window_bytes_width_too_small() {
327        sliding_window_bytes(b"test", 1);
328    }
329}