Skip to main content

iscc_lib/
simhash.rs

1//! SimHash algorithm and sliding window utilities.
2//!
3//! Implements the SimHash similarity-preserving hash and sliding window
4//! n-gram generation, ported from `iscc-core` `simhash.py` and `utils.py`.
5
6use crate::{IsccError, IsccResult};
7use std::cmp;
8
9/// Compute a SimHash from a sequence of equal-length hash digests.
10///
11/// Validates that all digests have equal length, then delegates to
12/// `alg_simhash_inner`. Returns `Err(IsccError::InvalidInput)` if
13/// digest lengths are mismatched.
14pub fn alg_simhash(hash_digests: &[impl AsRef<[u8]>]) -> IsccResult<Vec<u8>> {
15    if hash_digests.len() >= 2 {
16        let expected_len = hash_digests[0].as_ref().len();
17        for (i, digest) in hash_digests.iter().enumerate().skip(1) {
18            if digest.as_ref().len() != expected_len {
19                return Err(IsccError::InvalidInput(format!(
20                    "All hash digests must have equal length (expected {}, got {} at index {})",
21                    expected_len,
22                    digest.as_ref().len(),
23                    i
24                )));
25            }
26        }
27    }
28    Ok(alg_simhash_inner(hash_digests))
29}
30
31/// Compute a SimHash from a sequence of equal-length hash digests (unchecked).
32///
33/// Internal variant that skips length validation. Callers must guarantee
34/// all digests have equal length. Returns 32 zero bytes for empty input.
35pub(crate) fn alg_simhash_inner(hash_digests: &[impl AsRef<[u8]>]) -> Vec<u8> {
36    if hash_digests.is_empty() {
37        return vec![0u8; 32];
38    }
39
40    let n_bytes = hash_digests[0].as_ref().len();
41    let n_bits = n_bytes * 8;
42    let mut vector = vec![0u32; n_bits];
43
44    for digest in hash_digests {
45        let bytes = digest.as_ref();
46        for (i, count) in vector.iter_mut().enumerate() {
47            let byte_idx = i / 8;
48            let bit_idx = 7 - (i % 8); // MSB first
49            if (bytes[byte_idx] >> bit_idx) & 1 == 1 {
50                *count += 1;
51            }
52        }
53    }
54
55    // Threshold: count * 2 >= n matches Python's count >= n / 2.0
56    let n = hash_digests.len() as u32;
57    let mut result = vec![0u8; n_bytes];
58    for (i, &count) in vector.iter().enumerate() {
59        if count * 2 >= n {
60            let byte_idx = i / 8;
61            let bit_idx = 7 - (i % 8);
62            result[byte_idx] |= 1 << bit_idx;
63        }
64    }
65
66    result
67}
68
69/// Generate sliding window n-grams from a string.
70///
71/// Returns overlapping substrings of `width` Unicode characters, advancing
72/// by one character at a time. If the input is shorter than `width`, returns
73/// a single element containing the full input.
74///
75/// Returns `Err(IsccError::InvalidInput)` if `width < 2`.
76pub fn sliding_window(seq: &str, width: usize) -> IsccResult<Vec<String>> {
77    if width < 2 {
78        return Err(IsccError::InvalidInput(
79            "Sliding window width must be 2 or bigger.".into(),
80        ));
81    }
82    let chars: Vec<char> = seq.chars().collect();
83    let len = chars.len();
84    let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
85    Ok((0..range)
86        .map(|i| {
87            let end = cmp::min(i + width, len);
88            chars[i..end].iter().collect()
89        })
90        .collect())
91}
92
93/// Generate sliding window n-grams as borrowed string slices.
94///
95/// Returns overlapping substrings of `width` Unicode characters as `&str`
96/// slices borrowing from the input, avoiding per-n-gram `String` allocation.
97/// If the input is shorter than `width`, returns a single slice of the full input.
98#[cfg(feature = "text-processing")]
99pub(crate) fn sliding_window_strs(seq: &str, width: usize) -> Vec<&str> {
100    debug_assert!(width >= 2, "Sliding window width must be 2 or bigger.");
101    let char_indices: Vec<usize> = seq.char_indices().map(|(i, _)| i).collect();
102    let len = char_indices.len();
103    if len == 0 {
104        return vec![seq]; // empty string → single empty slice
105    }
106    let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
107    (0..range)
108        .map(|i| {
109            let start = char_indices[i];
110            let end = if i + width >= len {
111                seq.len()
112            } else {
113                char_indices[i + width]
114            };
115            &seq[start..end]
116        })
117        .collect()
118}
119
120/// Generate sliding window n-grams from a byte slice.
121///
122/// Returns overlapping sub-slices of `width` bytes, advancing by one byte
123/// at a time. If the input is shorter than `width`, returns a single slice
124/// of the full input.
125#[cfg(feature = "meta-code")]
126pub(crate) fn sliding_window_bytes(data: &[u8], width: usize) -> Vec<&[u8]> {
127    debug_assert!(width >= 2, "Sliding window width must be 2 or bigger.");
128    let len = data.len();
129    let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
130    (0..range)
131        .map(|i| {
132            let end = cmp::min(i + width, len);
133            &data[i..end]
134        })
135        .collect()
136}
137
138#[cfg(test)]
139mod tests {
140    use super::*;
141
142    #[test]
143    fn test_sliding_window_basic() {
144        assert_eq!(
145            sliding_window("Hello", 4).unwrap(),
146            vec!["Hell".to_string(), "ello".to_string()]
147        );
148    }
149
150    #[test]
151    fn test_sliding_window_shorter_than_width() {
152        assert_eq!(sliding_window("ab", 3).unwrap(), vec!["ab".to_string()]);
153    }
154
155    #[test]
156    fn test_sliding_window_exact_width() {
157        assert_eq!(sliding_window("abc", 3).unwrap(), vec!["abc".to_string()]);
158    }
159
160    #[test]
161    fn test_sliding_window_empty() {
162        assert_eq!(sliding_window("", 3).unwrap(), vec!["".to_string()]);
163    }
164
165    #[test]
166    fn test_sliding_window_unicode() {
167        assert_eq!(
168            sliding_window("äöü", 2).unwrap(),
169            vec!["äö".to_string(), "öü".to_string()]
170        );
171    }
172
173    #[test]
174    fn test_alg_simhash_single_digest() {
175        let digest = blake3::hash(b"test");
176        let result = alg_simhash(&[digest.as_bytes().to_vec()]).unwrap();
177        assert_eq!(result, digest.as_bytes().to_vec());
178    }
179
180    #[test]
181    fn test_alg_simhash_empty() {
182        let empty: Vec<Vec<u8>> = vec![];
183        let result = alg_simhash(&empty).unwrap();
184        assert_eq!(result, vec![0u8; 32]);
185    }
186
187    #[test]
188    fn test_alg_simhash_identical_digests() {
189        let digest = vec![0xFFu8; 32];
190        let result = alg_simhash(&[digest.clone(), digest.clone(), digest]).unwrap();
191        assert_eq!(result, vec![0xFFu8; 32]);
192    }
193
194    #[test]
195    fn test_alg_simhash_opposite_digests() {
196        // Two digests: all-ones and all-zeros. Threshold is count/2 = 1.
197        // Each bit has count 1 (from all-ones) or 0 (from all-zeros).
198        // vector[i] * 2 >= 2 → vector[i] >= 1. Bits from all-ones survive.
199        let ones = vec![0xFFu8; 32];
200        let zeros = vec![0x00u8; 32];
201        let result = alg_simhash(&[ones, zeros]).unwrap();
202        assert_eq!(result, vec![0xFFu8; 32]);
203    }
204
205    #[test]
206    fn test_alg_simhash_mismatched_lengths_returns_error() {
207        // Mismatched digest lengths should return Err, not panic
208        let result = alg_simhash(&[vec![1u8, 2], vec![1u8, 2, 3]]);
209        assert!(result.is_err());
210        let msg = result.unwrap_err().to_string();
211        assert!(
212            msg.contains("equal length"),
213            "error message should mention equal length, got: {msg}"
214        );
215    }
216
217    #[test]
218    fn test_sliding_window_width_too_small() {
219        let result = sliding_window("test", 1);
220        assert!(result.is_err());
221        let msg = result.unwrap_err().to_string();
222        assert!(
223            msg.contains("width must be 2"),
224            "error message should mention width constraint, got: {msg}"
225        );
226    }
227
228    // ---- sliding_window_strs tests ----
229
230    #[cfg(feature = "text-processing")]
231    #[test]
232    fn test_sliding_window_strs_basic() {
233        assert_eq!(sliding_window_strs("Hello", 4), vec!["Hell", "ello"]);
234    }
235
236    #[cfg(feature = "text-processing")]
237    #[test]
238    fn test_sliding_window_strs_shorter_than_width() {
239        assert_eq!(sliding_window_strs("ab", 3), vec!["ab"]);
240    }
241
242    #[cfg(feature = "text-processing")]
243    #[test]
244    fn test_sliding_window_strs_exact_width() {
245        assert_eq!(sliding_window_strs("abc", 3), vec!["abc"]);
246    }
247
248    #[cfg(feature = "text-processing")]
249    #[test]
250    fn test_sliding_window_strs_empty() {
251        assert_eq!(sliding_window_strs("", 3), vec![""]);
252    }
253
254    #[cfg(feature = "text-processing")]
255    #[test]
256    fn test_sliding_window_strs_unicode() {
257        assert_eq!(sliding_window_strs("äöü", 2), vec!["äö", "öü"]);
258    }
259
260    #[cfg(feature = "text-processing")]
261    #[cfg(debug_assertions)]
262    #[test]
263    #[should_panic(expected = "width must be 2")]
264    fn test_sliding_window_strs_width_too_small() {
265        sliding_window_strs("test", 1);
266    }
267
268    #[cfg(feature = "text-processing")]
269    #[test]
270    fn test_sliding_window_strs_matches_sliding_window() {
271        // Verify that sliding_window_strs produces identical content to sliding_window
272        let cases = vec![
273            ("Hello World", 4),
274            ("ab", 3),
275            ("abc", 3),
276            ("", 3),
277            ("äöü", 2),
278            ("Hello", 13),
279            ("a longer test string for n-grams", 5),
280        ];
281        for (input, width) in cases {
282            let strings = sliding_window(input, width).unwrap();
283            let strs = sliding_window_strs(input, width);
284            assert_eq!(
285                strings.len(),
286                strs.len(),
287                "length mismatch for input={input:?} width={width}"
288            );
289            for (s, sr) in strings.iter().zip(strs.iter()) {
290                assert_eq!(
291                    s.as_str(),
292                    *sr,
293                    "content mismatch for input={input:?} width={width}"
294                );
295            }
296        }
297    }
298
299    // ---- sliding_window_bytes tests ----
300
301    #[cfg(feature = "meta-code")]
302    #[test]
303    fn test_sliding_window_bytes_basic() {
304        assert_eq!(
305            sliding_window_bytes(b"Hello", 4),
306            vec![&b"Hell"[..], &b"ello"[..]]
307        );
308    }
309
310    #[cfg(feature = "meta-code")]
311    #[test]
312    fn test_sliding_window_bytes_shorter_than_width() {
313        assert_eq!(sliding_window_bytes(b"ab", 3), vec![&b"ab"[..]]);
314    }
315
316    #[cfg(feature = "meta-code")]
317    #[test]
318    fn test_sliding_window_bytes_exact_width() {
319        assert_eq!(sliding_window_bytes(b"abc", 3), vec![&b"abc"[..]]);
320    }
321
322    #[cfg(feature = "meta-code")]
323    #[test]
324    fn test_sliding_window_bytes_empty() {
325        assert_eq!(sliding_window_bytes(b"", 3), vec![&b""[..]]);
326    }
327
328    #[cfg(feature = "meta-code")]
329    #[test]
330    fn test_sliding_window_bytes_width_4() {
331        assert_eq!(
332            sliding_window_bytes(b"abcdef", 4),
333            vec![&b"abcd"[..], &b"bcde"[..], &b"cdef"[..]]
334        );
335    }
336
337    #[cfg(feature = "meta-code")]
338    #[cfg(debug_assertions)]
339    #[test]
340    #[should_panic(expected = "width must be 2")]
341    fn test_sliding_window_bytes_width_too_small() {
342        sliding_window_bytes(b"test", 1);
343    }
344}