1use crate::{IsccError, IsccResult};
7use std::cmp;
8
9pub fn alg_simhash(hash_digests: &[impl AsRef<[u8]>]) -> IsccResult<Vec<u8>> {
15 if hash_digests.len() >= 2 {
16 let expected_len = hash_digests[0].as_ref().len();
17 for (i, digest) in hash_digests.iter().enumerate().skip(1) {
18 if digest.as_ref().len() != expected_len {
19 return Err(IsccError::InvalidInput(format!(
20 "All hash digests must have equal length (expected {}, got {} at index {})",
21 expected_len,
22 digest.as_ref().len(),
23 i
24 )));
25 }
26 }
27 }
28 Ok(alg_simhash_inner(hash_digests))
29}
30
31pub(crate) fn alg_simhash_inner(hash_digests: &[impl AsRef<[u8]>]) -> Vec<u8> {
36 if hash_digests.is_empty() {
37 return vec![0u8; 32];
38 }
39
40 let n_bytes = hash_digests[0].as_ref().len();
41 let n_bits = n_bytes * 8;
42 let mut vector = vec![0u32; n_bits];
43
44 for digest in hash_digests {
45 let bytes = digest.as_ref();
46 for (i, count) in vector.iter_mut().enumerate() {
47 let byte_idx = i / 8;
48 let bit_idx = 7 - (i % 8); if (bytes[byte_idx] >> bit_idx) & 1 == 1 {
50 *count += 1;
51 }
52 }
53 }
54
55 let n = hash_digests.len() as u32;
57 let mut result = vec![0u8; n_bytes];
58 for (i, &count) in vector.iter().enumerate() {
59 if count * 2 >= n {
60 let byte_idx = i / 8;
61 let bit_idx = 7 - (i % 8);
62 result[byte_idx] |= 1 << bit_idx;
63 }
64 }
65
66 result
67}
68
69pub fn sliding_window(seq: &str, width: usize) -> IsccResult<Vec<String>> {
77 if width < 2 {
78 return Err(IsccError::InvalidInput(
79 "Sliding window width must be 2 or bigger.".into(),
80 ));
81 }
82 let chars: Vec<char> = seq.chars().collect();
83 let len = chars.len();
84 let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
85 Ok((0..range)
86 .map(|i| {
87 let end = cmp::min(i + width, len);
88 chars[i..end].iter().collect()
89 })
90 .collect())
91}
92
93pub(crate) fn sliding_window_strs(seq: &str, width: usize) -> Vec<&str> {
99 debug_assert!(width >= 2, "Sliding window width must be 2 or bigger.");
100 let char_indices: Vec<usize> = seq.char_indices().map(|(i, _)| i).collect();
101 let len = char_indices.len();
102 if len == 0 {
103 return vec![seq]; }
105 let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
106 (0..range)
107 .map(|i| {
108 let start = char_indices[i];
109 let end = if i + width >= len {
110 seq.len()
111 } else {
112 char_indices[i + width]
113 };
114 &seq[start..end]
115 })
116 .collect()
117}
118
119pub(crate) fn sliding_window_bytes(data: &[u8], width: usize) -> Vec<&[u8]> {
125 debug_assert!(width >= 2, "Sliding window width must be 2 or bigger.");
126 let len = data.len();
127 let range = cmp::max(len.saturating_sub(width).saturating_add(1), 1);
128 (0..range)
129 .map(|i| {
130 let end = cmp::min(i + width, len);
131 &data[i..end]
132 })
133 .collect()
134}
135
136#[cfg(test)]
137mod tests {
138 use super::*;
139
140 #[test]
141 fn test_sliding_window_basic() {
142 assert_eq!(
143 sliding_window("Hello", 4).unwrap(),
144 vec!["Hell".to_string(), "ello".to_string()]
145 );
146 }
147
148 #[test]
149 fn test_sliding_window_shorter_than_width() {
150 assert_eq!(sliding_window("ab", 3).unwrap(), vec!["ab".to_string()]);
151 }
152
153 #[test]
154 fn test_sliding_window_exact_width() {
155 assert_eq!(sliding_window("abc", 3).unwrap(), vec!["abc".to_string()]);
156 }
157
158 #[test]
159 fn test_sliding_window_empty() {
160 assert_eq!(sliding_window("", 3).unwrap(), vec!["".to_string()]);
161 }
162
163 #[test]
164 fn test_sliding_window_unicode() {
165 assert_eq!(
166 sliding_window("äöü", 2).unwrap(),
167 vec!["äö".to_string(), "öü".to_string()]
168 );
169 }
170
171 #[test]
172 fn test_alg_simhash_single_digest() {
173 let digest = blake3::hash(b"test");
174 let result = alg_simhash(&[digest.as_bytes().to_vec()]).unwrap();
175 assert_eq!(result, digest.as_bytes().to_vec());
176 }
177
178 #[test]
179 fn test_alg_simhash_empty() {
180 let empty: Vec<Vec<u8>> = vec![];
181 let result = alg_simhash(&empty).unwrap();
182 assert_eq!(result, vec![0u8; 32]);
183 }
184
185 #[test]
186 fn test_alg_simhash_identical_digests() {
187 let digest = vec![0xFFu8; 32];
188 let result = alg_simhash(&[digest.clone(), digest.clone(), digest]).unwrap();
189 assert_eq!(result, vec![0xFFu8; 32]);
190 }
191
192 #[test]
193 fn test_alg_simhash_opposite_digests() {
194 let ones = vec![0xFFu8; 32];
198 let zeros = vec![0x00u8; 32];
199 let result = alg_simhash(&[ones, zeros]).unwrap();
200 assert_eq!(result, vec![0xFFu8; 32]);
201 }
202
203 #[test]
204 fn test_alg_simhash_mismatched_lengths_returns_error() {
205 let result = alg_simhash(&[vec![1u8, 2], vec![1u8, 2, 3]]);
207 assert!(result.is_err());
208 let msg = result.unwrap_err().to_string();
209 assert!(
210 msg.contains("equal length"),
211 "error message should mention equal length, got: {msg}"
212 );
213 }
214
215 #[test]
216 fn test_sliding_window_width_too_small() {
217 let result = sliding_window("test", 1);
218 assert!(result.is_err());
219 let msg = result.unwrap_err().to_string();
220 assert!(
221 msg.contains("width must be 2"),
222 "error message should mention width constraint, got: {msg}"
223 );
224 }
225
226 #[test]
229 fn test_sliding_window_strs_basic() {
230 assert_eq!(sliding_window_strs("Hello", 4), vec!["Hell", "ello"]);
231 }
232
233 #[test]
234 fn test_sliding_window_strs_shorter_than_width() {
235 assert_eq!(sliding_window_strs("ab", 3), vec!["ab"]);
236 }
237
238 #[test]
239 fn test_sliding_window_strs_exact_width() {
240 assert_eq!(sliding_window_strs("abc", 3), vec!["abc"]);
241 }
242
243 #[test]
244 fn test_sliding_window_strs_empty() {
245 assert_eq!(sliding_window_strs("", 3), vec![""]);
246 }
247
248 #[test]
249 fn test_sliding_window_strs_unicode() {
250 assert_eq!(sliding_window_strs("äöü", 2), vec!["äö", "öü"]);
251 }
252
253 #[cfg(debug_assertions)]
254 #[test]
255 #[should_panic(expected = "width must be 2")]
256 fn test_sliding_window_strs_width_too_small() {
257 sliding_window_strs("test", 1);
258 }
259
260 #[test]
261 fn test_sliding_window_strs_matches_sliding_window() {
262 let cases = vec![
264 ("Hello World", 4),
265 ("ab", 3),
266 ("abc", 3),
267 ("", 3),
268 ("äöü", 2),
269 ("Hello", 13),
270 ("a longer test string for n-grams", 5),
271 ];
272 for (input, width) in cases {
273 let strings = sliding_window(input, width).unwrap();
274 let strs = sliding_window_strs(input, width);
275 assert_eq!(
276 strings.len(),
277 strs.len(),
278 "length mismatch for input={input:?} width={width}"
279 );
280 for (s, sr) in strings.iter().zip(strs.iter()) {
281 assert_eq!(
282 s.as_str(),
283 *sr,
284 "content mismatch for input={input:?} width={width}"
285 );
286 }
287 }
288 }
289
290 #[test]
293 fn test_sliding_window_bytes_basic() {
294 assert_eq!(
295 sliding_window_bytes(b"Hello", 4),
296 vec![&b"Hell"[..], &b"ello"[..]]
297 );
298 }
299
300 #[test]
301 fn test_sliding_window_bytes_shorter_than_width() {
302 assert_eq!(sliding_window_bytes(b"ab", 3), vec![&b"ab"[..]]);
303 }
304
305 #[test]
306 fn test_sliding_window_bytes_exact_width() {
307 assert_eq!(sliding_window_bytes(b"abc", 3), vec![&b"abc"[..]]);
308 }
309
310 #[test]
311 fn test_sliding_window_bytes_empty() {
312 assert_eq!(sliding_window_bytes(b"", 3), vec![&b""[..]]);
313 }
314
315 #[test]
316 fn test_sliding_window_bytes_width_4() {
317 assert_eq!(
318 sliding_window_bytes(b"abcdef", 4),
319 vec![&b"abcd"[..], &b"bcde"[..], &b"cdef"[..]]
320 );
321 }
322
323 #[cfg(debug_assertions)]
324 #[test]
325 #[should_panic(expected = "width must be 2")]
326 fn test_sliding_window_bytes_width_too_small() {
327 sliding_window_bytes(b"test", 1);
328 }
329}